Code Documentation for the Project: Crisis Coverage in The Times 1785–2020

Author

anonymous

1 Setup

This code presupposes the following folder structure, and that we start from the folder “./crisis/papers”.

Folder structure for the code to work

1.1 Install packages

Code
###########################################
# 0) INSTALL THE PACKAGES THAT WILL BE USED

  # install.packages() is vectorized over package names, so everything is
  # installed in one call instead of ~140 separate ones.
  # NOTE: "parallel" was removed from this list — it ships with base R and
  # cannot be installed from CRAN (the original call produced a warning).
  cran_packages <- c(
    "Amelia", "babynames", "brms", "broom.mixed", "car", "DescTools",
    "devtools", "dplyr", "DiagrammeR", "e1071", "effects", "effectsize",
    "emmeans", "entropy", "erer", "extrafont", "flextable", "forecast",
    "foreign", "furrr", "gam", "geonames", "ggeffects", "ggplot2",
    "ggthemes", "Gini", "glmmTMB", "GPArotation", "gridExtra", "Hmisc",
    "htmlTable", "httpgd", "hunspell", "huxtable", "imputeTS", "influenceR",
    "irrCAC", "kableExtra", "knitr", "lavaan", "lda", "ldatuning", "lemmar",
    "lexicon", "lme4", "lmerTest", "lmtest", "lsr", "LSX", "maptools",
    "marginaleffects", "margins", "MASS", "matrixStats", "mclust",
    "MCMCglmm", "mediation", "modelsummary", "modeltools",
    "moderate.mediation", "multcomp", "MuMIn", "mvtnorm", "nFactors",
    "nlme", "nnet", "openxlsx", "ordinal", "pander", "pbapply", "pbkrtest",
    "pdftools", "performance", "piecewiseSEM", "plotly", "ppcor", "pscl",
    "psych", "purrr", "qcoder", "qdap", "qdapDictionaries", "quanteda",
    "quanteda.dictionaries", "quanteda.sentiment", "quanteda.textmodels",
    "quanteda.textplots", "quanteda.textstats", "RCurl", "readr",
    "readtext", "remotes", "reshape2", "reticulate", "rJava", "rjson",
    "rKeywords", "rlist", "rstan", "rstantools", "rstatix", "rsvd",
    "RTextTools", "rvest", "semTools", "sjmisc", "sjPlot", "sjstats",
    "spacyr", "stargazer", "stm", "stopwords", "stringdist", "stringi",
    "stringr", "strucchange", "survey", "texreg", "tibble", "tidyr",
    "tidytext", "tidyverse", "tinytex", "tokenizers", "topicmodels",
    "tseries", "twitteR", "udpipe", "vars", "viridis", "viridisLite",
    "widyr", "WikipediR", "WikipediaR", "wordcloud", "wru", "xlsx",
    "xtable", "zoo"
  )
  install.packages(cran_packages)

  # GitHub-only packages, installed via devtools::install_github()
  library(devtools)

  install_github("coolbutuseless/ggpattern")
  install_github("burchill/zplyr")

  # install_github("emlaet/wwmetrics")

1.2 Load packages

Code
###########################################
# I) LOAD THE PACKAGES THAT WILL BE USED

  # library() (unlike require()) stops with an error when a package is
  # missing, so a broken environment is caught here rather than mid-analysis.
  # The attach order below is the same as in the original script, so any
  # function masking between packages (e.g. dplyr vs MASS) is unchanged.
  analysis_packages <- c(
    "Amelia", "babynames", "brms", "broom.mixed", "car", "DescTools",
    "dplyr", "DiagrammeR", "e1071", "effects", "effectsize", "emmeans",
    "entropy", "erer", "extrafont", "flextable", "forecast", "foreign",
    "furrr", "gam", "geonames", "ggeffects", "ggpattern", "ggplot2",
    "ggthemes", "Gini", "glmmTMB", "GPArotation", "gridExtra", "Hmisc",
    "htmlTable", "httpgd", "hunspell", "huxtable", "imputeTS", "influenceR",
    "irrCAC", "kableExtra", "knitr", "lavaan", "lda", "ldatuning", "lemmar",
    "lexicon", "lme4", "lmerTest", "lmtest", "lsr", "LSX", "maptools",
    "marginaleffects", "margins", "MASS", "matrixStats", "mclust",
    "MCMCglmm", "mediation", "modelsummary", "modeltools",
    "moderate.mediation", "multcomp", "MuMIn", "mvtnorm", "nFactors",
    "nlme", "nnet", "openxlsx", "ordinal", "pander", "parallel", "pbapply",
    "pbkrtest", "pdftools", "performance", "piecewiseSEM", "plotly",
    "ppcor", "pscl", "psych", "purrr", "qcoder", "qdap",
    "qdapDictionaries", "quanteda", "quanteda.dictionaries",
    "quanteda.sentiment", "quanteda.textmodels", "quanteda.textplots",
    "quanteda.textstats", "RCurl", "readr", "readtext", "remotes",
    "reshape2", "reticulate", "rJava", "rjson", "rKeywords", "rlist",
    "rstan", "rstantools", "rstatix", "rsvd", "RTextTools", "rvest",
    "semTools", "sjmisc", "sjPlot", "sjstats", "spacyr", "stargazer",
    "stm", "stopwords", "stringdist", "stringi", "stringr", "strucchange",
    "survey", "texreg", "tibble", "tidyr", "tidytext", "tidyverse",
    "tinytex", "tokenizers", "topicmodels", "tseries", "twitteR", "udpipe",
    "vars", "viridis", "viridisLite", "widyr", "WikipediR", "WikipediaR",
    "wwmetrics", "wordcloud", "wru", "xlsx", "xtable", "zoo", "zplyr"
  )
  invisible(lapply(analysis_packages, library, character.only = TRUE))

1.3 Toggle on parallel processing

Code
###########################################
# II) TOGGLE ON PARALLEL PROCESSING

  # registerDoParallel() comes from the doParallel package, which the original
  # script never loaded (detectCores/makeCluster are from base 'parallel').
  library(doParallel)

  no_cores <- detectCores(logical = TRUE)
  # Leave one core free for the OS / the interactive session
  cl <- makeCluster(no_cores-1)
  registerDoParallel(cl)
  # NOTE(review): call stopCluster(cl) once the parallel work is finished;
  # the original script never shuts the cluster down.

1.4 Create user-defined functions

Code
###########################################
# III) CREATE USER-DEFINED FUNCTIONS

## Spellcheck functions

### If hunspell suggests several words that have the same string distance (=deviation between the original word and the suggested word),
### 1 word is sampled randomly from the set of minimum string distance words (alternatively, the first suggestion can be extracted)

  ## NOTE: this deliberately masks base::which.min() for the rest of the
  ## script — later code relies on the randomized tie-breaking, so the name
  ## is kept. Returns ONE position of the minimum of x (ties broken randomly).
  which.min <- function(x){
          minx <- min(x,na.rm=TRUE)
          candidates <- which(x==minx)
          # BUG FIX: sample(n, 1) on a length-1 numeric vector samples from
          # 1:n instead of returning n, so a unique minimum at position p
          # could previously yield any position in 1:p. Guard that case.
          if (length(candidates)==1) {
              posmin <- candidates
          } else {
              posmin <- sample(candidates,1)
          }
          return(posmin)
      }

### The auto.spellcheck function will identify "bad" words, ask for suggestions for each of these words, and select the FIRST (=best) suggestion. 
### The bad words are then replaced by the suggestions, and the corrected text is returned.

  ### text:  a single character string (texts of <=100 chars are skipped)
  ### dict:  hunspell dictionary used to flag misspellings and get suggestions
  ### empty: an empty hunspell dictionary; checking against it flags every
  ###        token, which is used here as a tokenizer
  ### Returns the corrected text, or NA when the input was skipped.
  auto.spellcheck <- function(text,dict,empty){
    require(stringr)
    require(hunspell)
    corrected_text <- NA
    # && short-circuits, so is.na()/nchar() are never reached for non-character
    # or NA input
    if (is.character(text) && !is.na(text) && (nchar(text)>100))
      {
  #     text <- str_remove_all(text,"[-–—ũẽĩŨĨʒ[:punct:]]")
      # Full token sequence of the text (every token fails the empty dict)
      complete <- hunspell(text,dict=empty)[[1]]
      # Tokens not found in the real dictionary
      bad <- hunspell(text,dict=dict)[[1]]
      best <- hunspell_suggest(bad,dict=dict)
      nalt <- lengths(best)
      # Keep only misspellings for which hunspell offers at least 1 suggestion
      bad_filtered <- bad[nalt>0]
      # FIRST (= best) suggestion for each retained misspelling
      best_filtered <- unlist(lapply(best[nalt>0],`[[`,1))
      # Locate each bad word in the full token sequence, searching only after
      # the previously matched position so repeated words map correctly.
      # seq_along() fixes the original 1:length() loop, which ran (and failed)
      # when bad_filtered was empty; unmatched words are skipped instead of
      # crashing the assignment with an NA index.
      pos <- rep(NA_integer_,length(bad_filtered))
      cursor <- 0
      for (i in seq_along(bad_filtered)){
        if (cursor < length(complete)) {
          hit <- match(bad_filtered[i],complete[(cursor+1):length(complete)])
          if (!is.na(hit)) {
            pos[i] <- cursor + hit
            cursor <- pos[i]
          }
        }
      }
      corrected <- complete
      ok <- !is.na(pos)
      corrected[pos[ok]] <- best_filtered[ok]
      corrected_text <- paste(corrected,collapse=" ")
      }
    return(corrected_text)
    }

### Replace punctuation with words

  cleanPunct <- function(text){
      # Strip digits, then encode sentence punctuation as placeholder tokens
      # so it survives tokenization; revertPunct() restores the symbols.
      out <- str_remove_all(text,"[[:digit:]]")
      replacements <- c(
        "\\." = " xPUNKTx ",
        ";"   = " xSEMIx ",
        ", "  = " xKOMMAx ",
        "!"   = " xAUSRUFx ",
        "\\?" = " xFRAGEx "
      )
      # Apply sequentially, in the same order as the original substitutions
      for (pattern in names(replacements)) {
        out <- str_replace_all(out, pattern, replacements[[pattern]])
      }
      return(out)
      }

### Replace the punctuation words again with the initial punctuation

  revertPunct <- function(text){
      # Inverse of cleanPunct(): map each placeholder token back to the
      # punctuation mark it stands for, in the original substitution order.
      restorations <- list(
        c("xPUNKTx", "\\."),
        c("xSEMIx", ";"),
        c("xKOMMAx", ","),
        c("xAUSRUFx", "!"),
        c("xFRAGEx", "\\?")
      )
      for (pair in restorations) {
        text <- str_replace_all(text, pair[1], pair[2])
      }
      return(text)
      }

### Automatically spellcheck a text with punctuation removed

  ### Variant used on texts whose punctuation was encoded by cleanPunct():
  ### the token sequence comes from quanteda::tokens(), every token is flagged
  ### (dict=empty), and each is replaced by its best suggestion from dict
  ### when one exists. Returns the corrected text, or NA when skipped.
  auto.spellcheck.punct <- function(text,dict,empty){
    require(stringr)
    require(hunspell)
    corrected_text <- NA
    # && short-circuits, so is.na()/nchar() are never reached for bad input
    if (is.character(text) && !is.na(text) && (nchar(text)>100))
      {
      complete <- quanteda::tokens(text,remove_numbers=TRUE)[[1]]
      # Empty dictionary: every word is "bad" and gets a suggestion lookup
      bad <- hunspell(text,dict=empty)
      best <- hunspell_suggest(bad[[1]],dict=dict)
      nalt <- lengths(best)
      bad_filtered <- unlist(bad)[nalt>0]
      best_filtered <- unlist(lapply(best[nalt>0],`[[`,1))
      # Align bad words to positions in the token sequence, scanning forward
      # from the previous match so repeated words map correctly.
      # seq_along() fixes the original 1:length() loop, which ran (and failed)
      # when bad_filtered was empty; unmatched words are skipped instead of
      # crashing the assignment with an NA index.
      pos <- rep(NA_integer_,length(bad_filtered))
      cursor <- 0
      for (i in seq_along(bad_filtered)){
        if (cursor < length(complete)) {
          hit <- match(bad_filtered[i],complete[(cursor+1):length(complete)])
          if (!is.na(hit)) {
            pos[i] <- cursor + hit
            cursor <- pos[i]
          }
        }
      }
      corrected <- complete
      ok <- !is.na(pos)
      corrected[pos[ok]] <- best_filtered[ok]
      corrected_text <- paste(corrected,collapse=" ")
      }
    return(corrected_text)
    }

### Mark words that are not included in a specified dictionary

  marknondict <- function(word,dictionary){
    # TRUE for words present in the dictionary, FALSE otherwise. Vectorized;
    # %in% never returns NA, so unknown or NA words simply come back FALSE.
    word %in% dictionary
    }

### UNFINISHED FUNCTION: Split words into smaller parts that are included in a specified dictionary

  ### NOTE(review): marked UNFINISHED by the author. `endict` is a global not
  ### defined in this chunk, `return_string` is assigned but never used, and
  ### when every word is recognized the function returns NULL (the invisible
  ### value of the untaken if-branch).
  split_words <- function(string,dictionary1){
    return_string <- string
    # Positions of tokens that are neither dictionary words nor number-like
    # tokens (digits optionally followed by up to two letters, e.g. "2nd")
    position_unrecognized   <- which(!(string%in%dictionary1)&!(str_detect(string,"^[:digit:]+[:alpha:]{0,2}")))
    if(length(position_unrecognized)>0)
      {
      # Match the unrecognized tokens against `endict` used as a regex pattern;
      # return only the non-NA (matched) fragments
      strd <- str_match(string[position_unrecognized],pattern=endict)
      strd[!is.na(strd)]
      }
    }

### Substitute words with best-fitting words from a specified dictionary

  ### For each token not found in dictionary1 (and not number-like), find the
  ### closest word in dictionary2 by weighted optimal-string-alignment (OSA)
  ### distance and substitute it when the distance is below a length-dependent
  ### threshold. dictionary2.1 appears to be a two-column table aligned with
  ### dictionary2 whose column 2 holds word weights/frequencies — TODO confirm
  ### against the calling code. Requires stringdist::stringdistmatrix() and
  ### matrixStats::colMins()/colMaxs(). Returns the result wrapped in a list.
  substitute_words <- function(string,dictionary1,dictionary2,dictionary2.1){
    return_string <- string
    dictionary2.2 <- dictionary2.1
    # Add uniform jitter to the weights so ties between equally-close
    # candidates are broken pseudo-randomly (non-deterministic: uses runif())
    dictionary2.2[,2] <- dictionary2.1[,2]+runif(min=0,max=1,n=length(dictionary2.1[,2]))
    position_unrecognized   <- which(!(string%in%dictionary1)&!(str_detect(string,"^[:digit:]+[:alpha:]{0,2}")))
    if(length(position_unrecognized)>0)
      {
      # Max permissible distance: one tenth of the token's character length
      position_threshold        <- nchar(string[position_unrecognized])/10
    #   text_reduced            <- string[position_unrecognized]
      # Distance matrix: rows = dictionary2 words, cols = unrecognized tokens
      sdm   <- stringdistmatrix(dictionary2,string[position_unrecognized],method="osa",weight=c(0.75,1.0,0.50,0.75))
      # Smallest distance attained for each token
      romi <- colMins(sdm)
      wf <- matrix(rep(dictionary2.2[,2],times=dim(sdm)[2]),nrow=dim(sdm)[1])
      # TRUE where a dictionary word attains the per-token minimum distance
      strd <- (sdm==t(matrix(rep(romi,times=dim(sdm)[1]),nrow=dim(sdm)[2])))
      # Among minimum-distance candidates, pick the one with the highest
      # (jittered) weight; the jitter makes that maximum unique
      posi <- match(colMaxs(wf*strd),dictionary2.2[,2])
      # Substitute only tokens whose best distance is under their threshold
      return_string[position_unrecognized[romi<position_threshold]] <- dictionary2[posi[romi<position_threshold]]
      }
    return(list(return_string))
  }

### Clean up possible OCR errors based on a dictionary of all "legal" words

  ### NOTE(review): this function appears unfinished/broken as written:
  ###  - substitute_words() is called with 2 arguments although it is defined
  ###    above with 4 required parameters, and its return value is discarded;
  ###  - the main loop iterates over a data frame `x` that is not an argument
  ###    (a global), and relies on the globals `fullwordlist` and `dict`;
  ###  - `if (dim(x)[1]>0)` errors if no object `x` exists in scope.
  ### Documented as-is — confirm intent against the calling code before use.
  ocr.tok <- function(tokens,dictionary){
      ndoc <- length(tokens)
      for (d in 1:ndoc)
      {
        substitute_words(tokens[[d]],dictionary)
      }
      y <- character()
      if (dim(x)[1]>0) {
      for (j in 1:dim(x)[1]){   # a loop that runs through all documents.
        text_temp <- x[j,"text"]
        tok_temp  <- quanteda::tokens(str_to_lower(text_temp),remove_punct=TRUE, remove_symbols=TRUE, remove_numbers=FALSE, split_hyphens=TRUE,remove_separators=TRUE)[[1]]
        goodpos <- which(tok_temp%in%fullwordlist) # marker which words are recognized either as words (endict) or names (person_names,surnames).
        badpos  <- which(!(tok_temp%in%fullwordlist))  # marker which words are not recognized as words (endict) or names (person_names,surnames).
        good <- tok_temp[goodpos] # list of all words that are unproblematic. Will be used later to compile a full list of all words again.
        bad <- tok_temp[badpos] # list of all words unrecognized that should be substituted if possible. 
        wole <- nchar(bad) # Word length of the words that are unrecognized. This is important to calculate the maximum permissible number of deviations for substitution.
        thresh <- ceiling(wole/3) # Word-specific threshold for deciding whether to substitute it for the best-fitting word in the dictionary or not.
        # NOTE(review): when there are no bad words, 1:length(bad) is 1:0 and
        # the loop still iterates (k = 1, 0); only the guard inside prevents
        # out-of-bounds work.
        for (k in 1:length(bad))    # a loop that runs through all "bad" words in a document.
          {
          if (length(bad)>0) {      # condition to either proceed if there are any bad words, or jump to the next document if there are no bad words.
            stdi.raw <- stringdist(dict,tok_temp[badpos[k]],method="osa",weight=c(1,1,1,1)) # Calculates the deviation between the bad word to be substituted and all words in the dictionary of English words.
            stdi.pos <- stringdist(dict,paste0(tok_temp[badpos[k]],tok_temp[badpos[k]+1]),method="osa",weight=c(1,1,1,1)) # The same calculation as in stdi.raw, but adds the next word to the current bad word. If there is a linebreak, a word may have been broken down into two broken words if the hyphen was not recognized.
            stdi.pre <- stringdist(dict,paste0(tok_temp[badpos[k]-1],tok_temp[badpos[k]]),method="osa",weight=c(1,1,1,1)) # The same calculation as in stdi.raw, but adds the immediately preceding word to the current bad word. For the same logic, for the word ending up at the start rather than the end of the line.
            min.stdi.raw <- min(stdi.raw,na.rm=T) # Gives the lowest observed raw deviation.
            min.stdi.pos <- min(stdi.pos,na.rm=T) # Gives the lowest observed deviation when combined with the next word.
            min.stdi.pre <- min(stdi.pre,na.rm=T) # Gives the lowest observed deviation when combined with the previous word.
            stdi <- if(min.stdi.raw<=min.stdi.pos & min.stdi.raw<=min.stdi.pre) {stdi.raw} else 
                  if (min.stdi.pos<=min.stdi.pre) {stdi.pos} else 
                  {stdi.pre} # choose the stdi list with the lowest deviation
            loc <- which(stdi==min(stdi)) # Position of the lowest values in stdi (maybe several)
            locsim <- loc[which(nchar(dict[loc])==wole[k])] # Position of the lowest values in stdi that have the same word length as the target word (maybe several)
            if(length(locsim)>0){ # is there any location with similar word length as original word?
              if(stdi[locsim][1]<=thresh[k]){ # lower than threshold?
              tok_temp[badpos[k]] <- sample(x=dict[locsim],size=1)} else {tok_temp[badpos[k]] <- " "}} # draw a sample of one from the set of words that have the same length and fit optimally (best fitters of same length)
            # NOTE(review): this second block runs unconditionally, so the
            # loc-based substitution overwrites the locsim-based one above —
            # contradicting the author's trailing comment below; confirm which
            # precedence was intended.
            if(length(loc)>0){ # is there any location?
              if(stdi[loc][1]<=thresh[k]){ # lower than threshold?
              tok_temp[badpos[k]] <- sample(x=dict[loc],size=1)}
              else{tok_temp[badpos[k]] <- " "}} # draw a sample of one from the set of words that fit just as good (best fitters)
  #             print(paste0(k,"/",length(bad)))
          flush.console()
            } # note that the result from loc will be overwritten with the result from locsim if locsim >0
        }
        y <- c(y,paste(tok_temp,collapse=" "))
        print(j)
        flush.console()
        }
    }
  return(y)
  }

### Extracts the first nwords from a text represented as a character vector

  firstwords <- function(x,nwords){
    # Join the first nwords elements of x into one space-separated string.
    # head() fixes the original out-of-range subscript x[1:nwords], which
    # padded the result with literal "NA" tokens when nwords > length(x);
    # the value is now also returned visibly (the original ended on an
    # assignment, which returns invisibly).
    paste(head(x,nwords),collapse=" ")
  }

### Counts the number of matches between a pattern and a character vector

  str_counter <- function(string,pattern){
      # Count non-NA match cells returned by str_match(): one per element of
      # `string` that contains the pattern (plus one per capture group match,
      # if the pattern defines groups).
      hits <- str_match(string=string,pattern=pattern)
      sum(!is.na(hits),na.rm=TRUE)
  }

# Metadata procedures

## Dates

  # Month index of a date (years-since-1900 * 12 + zero-based month, from
  # POSIXlt); only differences between two such indices are meaningful.
  monnb <- function(d) {
    lt <- as.POSIXlt(as.Date(d, origin="1900-01-01"))
    lt$year*12 + lt$mon
  }
  # Signed number of calendar months between two dates (d2 - d1)
  mondf <- function(d1, d2) monnb(d2) - monnb(d1)

  # Year index of a date (years since 1900, as stored in POSIXlt)
  yearnb <- function(d) {
    lt <- as.POSIXlt(as.Date(d, origin="1900-01-01"))
    lt$year
  }
  # Signed number of calendar years between two dates (d2 - d1)
  yeardf <- function(d1, d2) yearnb(d2) - yearnb(d1)

  ## Create moving average time series based on daily time series, can specify alignment of moving averages

  ## Add daily-moving-average (DMA) columns for a target window and a set of
  ## baseline windows to a daily time series.
  ##
  ## x:         data frame with daily rows and columns "count0" and "share0"
  ## target:    window length (days) of the target moving average
  ## baselines: vector of window lengths for the baseline moving averages
  ## align:     "right", "left" or "center" (passed to zoo::rollmean)
  ##
  ## rollmean() leaves NAs at the window edges; those edge rows are padded
  ## with the nearest computed value so every row carries a DMA.
  create_dma <- function(x,target,baselines,align){
    data <- x
    target.name <- paste0("DMA",target)
    baseline.names <- paste0("DMA",baselines)
    target.name.s <- paste0("DMA",target,".share")
    baseline.names.s <- paste0("DMA",baselines,".share")
    target.name.a <- paste0("a",target)
    baseline.names.a <- paste0("a",baselines)
    first <- 1
    last <- dim(data)[1]
    # Initialize all output columns (the a* activation columns are filled by
    # the downstream wave-detection step)
    data[,target.name] <- 0
    data[,target.name.s] <- 0
    data[,target.name.a] <- 0
    for (j in seq_along(baseline.names))
      {
      data[,baseline.names[j]] <- 0
      data[,baseline.names.s[j]] <- 0
      data[,baseline.names.a[j]] <- 0
      }
    data[,target.name] <- rollmean(k=target,x=data[,"count0"],align=align,fill=c(NA,NA,NA))
    data[,target.name.s] <- rollmean(k=target,x=data[,"share0"],align=align,fill=c(NA,NA,NA))
    if (align=="right")
      {
      # First target-1 rows are NA: pad with the first computed mean
      data[,target.name][first:(target-1)] <- data[,target.name][target]
      data[,target.name.s][first:(target-1)] <- data[,target.name.s][target]
      }
    if (align=="left")
      {
      # Last target-1 rows are NA: pad with the last computed mean
      data[,target.name][(last-target+2):last] <- data[,target.name][last-target+1]
      data[,target.name.s][(last-target+2):last] <- data[,target.name.s][last-target+1]
      }
    if (align=="center")
      {
      # Pad roughly half a window at each end
      data[,target.name][1:(round(target/2))] <- data[,target.name][round(target/2)+1]
      data[,target.name.s][1:round(target/2)] <- data[,target.name.s][round(target/2)+1]
      data[,target.name][(last-round(target/2)+1):last] <- data[,target.name][(last-round(target/2))]
      data[,target.name.s][(last-round(target/2)+1):last] <- data[,target.name.s][(last-round(target/2))]
      }
    for (j in seq_along(baseline.names))
      {
      data[,baseline.names[j]] <- rollmean(k=baselines[j],x=data[,"count0"],align=align,fill=c(NA,NA,NA))
      data[,baseline.names.s[j]] <- rollmean(k=baselines[j],x=data[,"share0"],align=align,fill=c(NA,NA,NA))
      if (align=="right")
        {
        data[,baseline.names[j]][first:(baselines[j]-1)] <- data[,baseline.names[j]][baselines[j]]
        data[,baseline.names.s[j]][first:(baselines[j]-1)] <- data[,baseline.names.s[j]][baselines[j]]
        }
      if (align=="left")
        {
        # BUG FIX: the original read `last-baselines[j]+1:last`, which R
        # parses as (last-baselines[j]) + (1:last) because `:` binds tighter
        # than `+`, producing out-of-range indices. Pad the NA tail exactly
        # like the target branch above.
        data[,baseline.names[j]][(last-baselines[j]+2):last] <- data[,baseline.names[j]][last-baselines[j]+1]
        data[,baseline.names.s[j]][(last-baselines[j]+2):last] <- data[,baseline.names.s[j]][last-baselines[j]+1]
        }
      if (align=="center")
        {
        data[,baseline.names[j]][first:(round(baselines[j]/2))] <- data[,baseline.names[j]][round(baselines[j]/2)+1]
        data[,baseline.names.s[j]][first:(round(baselines[j]/2))] <- data[,baseline.names.s[j]][round(baselines[j]/2)+1]
        data[,baseline.names[j]][(last-round(baselines[j]/2)+1):last] <- data[,baseline.names[j]][(last-round(baselines[j]/2))]
        data[,baseline.names.s[j]][(last-round(baselines[j]/2)+1):last] <- data[,baseline.names.s[j]][(last-round(baselines[j]/2))]
        }
      }
    return(data)
    }

# The same function without the option to pick right, left or center alignment of the moving averages (all right-aligned)

  ## Per-topic version of create_dma(): right-aligned moving averages computed
  ## separately for each topic in `topics`. `x` must contain a "topic" column
  ## and, within each topic, chronologically ordered daily "count0"/"share0"
  ## rows.
  create_dma2 <- function(x,topics,target,baselines){
    data <- x
    target.name <- paste0("DMA",target)
    baseline.names <- paste0("DMA",baselines)
    target.name.s <- paste0("DMA",target,".share")
    baseline.names.s <- paste0("DMA",baselines,".share")
    target.name.a <- paste0("a",target)
    baseline.names.a <- paste0("a",baselines)
    # Initialize all output columns (a* columns are filled downstream)
    data[,target.name] <- 0
    data[,target.name.s] <- 0
    data[,target.name.a] <- 0
    for (j in seq_along(baseline.names))
      {
      data[,baseline.names[j]] <- 0
      data[,baseline.names.s[j]] <- 0
      data[,baseline.names.a[j]] <- 0
      }
    for (i in seq_along(topics))
      {
      data.to <- subset(data,topic==topics[i])
      data.to[,target.name] <- rollmean(k=target,x=data.to[,"count0"],align="right",fill=c(NA,NA,NA))
      data.to[,target.name.s] <- rollmean(k=target,x=data.to[,"share0"],align="right",fill=c(NA,NA,NA))
      # Pad the NA head (first target-1 rows) with the first computed mean
      data.to[,target.name][1:(target-1)] <- data.to[,target.name][target]
      data.to[,target.name.s][1:(target-1)] <- data.to[,target.name.s][target]
      for (j in seq_along(baseline.names))
        {
        data.to[,baseline.names[j]] <- rollmean(k=baselines[j], x=data.to[,"count0"],align="right",fill=c(NA,NA,NA))
        # BUG FIX: the original padded only the first target-1 rows here, but
        # a k=baselines[j] right-aligned rolling mean leaves baselines[j]-1
        # leading NAs (this matches the first:(baselines[j]-1) padding used
        # in create_dma()).
        data.to[,baseline.names[j]][1:(baselines[j]-1)] <- data.to[, baseline.names[j]][baselines[j]]
        data.to[,baseline.names.s[j]] <- rollmean(k=baselines[j], x=data.to[,"share0"],align="right",fill=c(NA,NA,NA))
        data.to[,baseline.names.s[j]][1:(baselines[j]-1)] <- data.to[, baseline.names.s[j]][baselines[j]]
        }
      data[data$topic==topics[i],] <- data.to
      # Per-topic progress report (the original printed only once, after the
      # loop had finished)
      print(i)
      flush.console()
      }
    return(data)
    }


# Automated text analysis procedures

## Headlines

  list_reduce <- function(x,text.headlines){
    # Extract the first element of a list and coerce it to character.
    # `text.headlines` is accepted but unused (kept for call compatibility).
    as.character(x[[1]])
    }

## Crisis news wave detection, labelling, validation

  ## Flag "news waves": stretches where the target moving average
  ## (DMA<target>) exceeds the baseline moving averages (DMA<baselines>).
  ## Adds per-baseline activation columns (a*), their row sum (a) and lag,
  ## excess-share (v*) / excess-count (n*) columns and wave numbering.
  ## Variant of find_newswaves() with a more permissive wave membership
  ## rule (a > 0 instead of a > 2 for pnw / a.pnw.no).
  find_newswaves2 <- function(x,target,baselines)
    {
      data <- x
      target.name <- paste0("DMA",target)
      target.name.s <- paste0("DMA",target,".share")
      target.name.a <- paste0("a",target)
      baseline.names <- paste0("DMA",baselines)
      baseline.names.s <- paste0("DMA",baselines,".share")
      baseline.names.a <- paste0("a",baselines)
      baseline.names.v <- paste0("v",baselines)
      baseline.names.n <- paste0("n",baselines)
      for (j in seq_along(baselines))
        {
          # 1 if the target DMA is above baseline j on that day, else 0.
          # BUG FIX: the original used data$target.name / data$baseline.names[j],
          # which looks up a literal column named "target.name" (NULL) instead
          # of the constructed column names.
          data[,baseline.names.a[j]] <- 1*(data[,target.name]>data[,baseline.names[j]])
        }
      # Number of baselines exceeded per day. BUG FIXES: the original
      # referenced the misspelled object `baselines.names.a`; drop=FALSE keeps
      # the data-frame shape when only one baseline is supplied.
      data[,"a"] <- rowSums(data[,baseline.names.a,drop=FALSE])
      # One-day lag of a. BUG FIX: the original dropped element n-1 instead of
      # the last element, misaligning the lag from that point on.
      data[,"a.lag1"] <- c(NA,data[,"a"][-length(data$a)])
      for (j in seq_along(baselines))
        {
          # Excess share / count relative to baseline j while above it, else 0.
          # BUG FIX: the original multiplied by data[,target.name.a], a column
          # this function never creates; use the per-baseline activation as in
          # find_newswaves().
          data[,baseline.names.v[j]] <- (data[,baseline.names.a[j]]) * (data[,target.name.s]-data[,baseline.names.s[j]])
          data[,baseline.names.n[j]] <- (data[,baseline.names.a[j]]) * (data[,target.name]-data[,baseline.names[j]])
        }
      data$new.topic <- 0
      # First row of each topic block (86197 rows per topic; NOTE(review):
      # hard-coded — confirm it matches the panel layout)
      data$new.topic[seq(1,length(data$new.topic),86197)] <- 1
      # A wave switches on when a crosses from <3 to >2, and off in reverse.
      # NOTE(review): row 1 has a.lag1 = NA, so turn.on[1] is NA whenever
      # a[1] > 2 and cumsum() below then propagates NA — confirm upstream
      # guarantees a[1] <= 2.
      data$turn.on <- 1*(data$a.lag1<3 & data$a>2)
      data$turn.off <- 1*(data$a<3 & data$a.lag1>2)
      data$pnw <- (data$turn.on==1 | data$a>0)
      data$pnw.no <- cumsum(data$turn.on)
      data$a.pnw.no <- (data$a>0)*data$pnw.no
      return(data)
    }

  ## Flag "news waves": stretches where the target moving average
  ## (DMA<target>) exceeds more than 2 of the baseline moving averages
  ## (DMA<baselines>). Adds per-baseline activation columns (a*), their row
  ## sum (a) and one-day lag, excess-share (v*) / excess-count (n*) columns
  ## and wave numbering (pnw.no / a.pnw.no).
  find_newswaves <- function(x,target,baselines)
    {
    data <- x
    target.name <- paste0("DMA",target)
    target.name.s <- paste0("DMA",target,".share")
    target.name.a <- paste0("a",target)
    baseline.names <- paste0("DMA",baselines)
    baseline.names.s <- paste0("DMA",baselines,".share")
    baseline.names.a <- paste0("a",baselines)
    baseline.names.v <- paste0("v",baselines)
    baseline.names.n <- paste0("n",baselines)
    for (j in seq_along(baselines))
      {
      # 1 if the target DMA is above baseline j on that day, else 0
      # (the redundant NA pre-initialization of the column was dropped)
      data[,baseline.names.a[j]] <- 1*(data[,target.name] > data[,baseline.names[j]])
      }
    # Number of baselines exceeded per day; drop=FALSE keeps the data-frame
    # shape when only one baseline is supplied (rowSums errors on a vector)
    data[,"a"] <- rowSums(data[,baseline.names.a,drop=FALSE])
    # One-day lag of a. BUG FIX: the original dropped element n-1 instead of
    # the last element, misaligning the lag from that point on.
    data[,"a.lag1"] <- c(NA,data[,"a"][-length(data$a)])
    for (j in seq_along(baselines))
      {
      # Excess share / count relative to baseline j while above it, else 0
      data[,baseline.names.v[j]] <- (data[,baseline.names.a[j]]) * (data[,target.name.s]-data[,baseline.names.s[j]])
      data[,baseline.names.n[j]] <- (data[,baseline.names.a[j]]) * (data[,target.name]-data[,baseline.names[j]])
      }
    data$new.topic <- 0
    # First row of each topic block (86197 rows per topic; NOTE(review):
    # hard-coded — confirm it matches the panel layout)
    data$new.topic[seq(1,length(data$new.topic),86197)] <- 1
    # A wave switches on when a crosses from <3 to >2, and off in reverse.
    # NOTE(review): row 1 has a.lag1 = NA, so turn.on[1] is NA whenever
    # a[1] > 2 and cumsum() below then propagates NA — confirm upstream
    # guarantees a[1] <= 2.
    data$turn.on <- 1*(data$a.lag1<3 & data$a>2)
    data$turn.off <- 1*(data$a<3 & data$a.lag1>2)
    data$pnw <- (data$turn.on==1 | data$a>2)
    data$pnw.no <- cumsum(data$turn.on)
    data$a.pnw.no <- (data$a>2)*data$pnw.no
    return(data)
    }

  ### Summarize detected waves, one row per wave id (o30id): duration, volume,
  ### mean/max intensity, variability, start/end and per-horizon baselines.
  ### Expects the o30* / n* columns from an upstream wave-detection step not
  ### shown in this chunk; colMaxs()/colSds() require matrixStats.
  waveanalyzer6 <- function(x){
    data <- x
    nwaves <- max(data$o30id,na.rm=T)
    # Seed data frame; further columns are created on first assignment
    wavedesc <- data.frame(id=NA,volume=NA,duration=NA,intensity=NA,start=NA,end=NA)
    for (w in 1:nwaves)
    {
      wave <- subset(data,o30id==w)
      # Row(s) of the wave at its maximum duration, i.e. its final day(s)
      waveend <- subset(wave,o30dur==max(wave$o30dur,na.rm=TRUE))
      wavedesc[w,"id"] <- w
      wavedesc[w,"duration"] <- waveend$o30dur[1]
      # Volume/intensity statistics averaged across the five excess horizons
      wavedesc[w,"volume"] <- mean(colSums(as.matrix(wave[,c("o30d90", "o30d180", "o30d365", "o30d1461", "o30d3652")]),na.rm=TRUE),na.rm=TRUE)
      wavedesc[w,"intensity"] <- mean(colMeans(as.matrix(wave[,c("o30d90", "o30d180", "o30d365", "o30d1461", "o30d3652")]),na.rm=TRUE),na.rm=TRUE)
      wavedesc[w,"max.intensity"] <- mean(colMaxs(as.matrix(wave[,c("o30d90", "o30d180", "o30d365", "o30d1461", "o30d3652")]),na.rm=TRUE),na.rm=TRUE)
      wavedesc[w,"variability"] <- mean(colSds(as.matrix(wave[,c("o30d90", "o30d180", "o30d365", "o30d1461", "o30d3652")]),na.rm=TRUE),na.rm=TRUE)
      # NOTE(review): unlike o30dur above, no [1] subscript here — this errors
      # if waveend has more than one row; confirm o30sta/o30end are constant
      # within a wave.
      wavedesc[w,"start"] <- waveend$o30sta
      wavedesc[w,"end"] <- waveend$o30end
      # Mean baseline levels over the active part of the wave (o30dur > 0)
      wavedesc[w,"baseline30"] <- mean(subset(wave,o30dur>0)$n30,na.rm=TRUE)
      wavedesc[w,"baseline90"] <- mean(subset(wave,o30dur>0)$n90,na.rm=TRUE)
      wavedesc[w,"baseline180"] <- mean(subset(wave,o30dur>0)$n180,na.rm=TRUE)
      wavedesc[w,"baseline365"] <- mean(subset(wave,o30dur>0)$n365,na.rm=TRUE)
      wavedesc[w,"baseline1461"] <- mean(subset(wave,o30dur>0)$n1461,na.rm=TRUE)
      wavedesc[w,"baseline3652"] <- mean(subset(wave,o30dur>0)$n3652,na.rm=TRUE)
    }
    return(wavedesc)
  }

  ### Summarize news waves numbered by find_newswaves()/find_newswaves2():
  ### one row per a.pnw.no value (including 0 = "not in a wave"), with
  ### duration, volume, intensity, variability, start/end day, mean baseline
  ### counts and the topic (written to column "topic250").
  ### Requires matrixStats (colMaxs, colSds). NOTE(review): `thousands` is a
  ### global used for progress printing and is not defined in this chunk.
  waveanalyzer5 <- function(x){
    data <- x
    minwaves <- min(data$a.pnw.no,na.rm=T)
    maxwaves <- max(data$a.pnw.no,na.rm=T)
    # Seed data frame; further columns are created on first assignment
    wavedesc <- data.frame(id=NA,volume=NA,duration=NA,intensity=NA,start=NA,end=NA)
    for (w in minwaves:maxwaves)
    {
      wave <- subset(data,a.pnw.no==w)
  #     waveend <- subset(wave,o30dur==max(wave$o30dur,na.rm=TRUE))
      wavedesc[w-minwaves+1,"id"] <- w
      wavedesc[w-minwaves+1,"duration"] <- dim(wave)[1]
      # Guard: wave ids may be non-contiguous, so a given w can have 0 rows
      if (wavedesc[w-minwaves+1,"duration"]>0) {
        wavedesc[w-minwaves+1,"volume"] <- mean(colSums(as.matrix(wave[,c("n90", "n180", "n365", "n730", "n1825")]),na.rm=TRUE),na.rm=TRUE)
        wavedesc[w-minwaves+1,"intensity"] <- mean(colMeans(as.matrix(wave[,c("n90", "n180", "n365", "n730", "n1825")]),na.rm=TRUE),na.rm=TRUE)
        wavedesc[w-minwaves+1,"max.intensity"] <- mean(colMaxs(as.matrix(wave[,c("n90", "n180", "n365", "n730", "n1825")]),na.rm=TRUE),na.rm=TRUE)
        wavedesc[w-minwaves+1,"variability"] <- mean(colSds(as.matrix(wave[,c("n90", "n180", "n365", "n730", "n1825")]),na.rm=TRUE),na.rm=TRUE)
        wavedesc[w-minwaves+1,"start"] <- wave$day[1]
        wavedesc[w-minwaves+1,"end"] <- wave$day[length(wave$day)]
        wavedesc[w-minwaves+1,"baseline30"] <- mean(wave$i.count.30,na.rm=TRUE)
        wavedesc[w-minwaves+1,"baseline90"] <- mean(wave$i.count.90,na.rm=TRUE)
        wavedesc[w-minwaves+1,"baseline180"] <- mean(wave$i.count.180,na.rm=TRUE)
        wavedesc[w-minwaves+1,"baseline365"] <- mean(wave$i.count.365,na.rm=TRUE)
        wavedesc[w-minwaves+1,"baseline730"] <- mean(wave$i.count.730,na.rm=TRUE)
        wavedesc[w-minwaves+1,"baseline1825"] <- mean(wave$i.count.1825,na.rm=TRUE)
        wavedesc[w-minwaves+1,"topic250"] <- wave$topic[1] }
      if (w%in%thousands) print(w)
      flush.console()
    }
    return(wavedesc)
  }

  ### Identical to waveanalyzer5() except that the topic is written to column
  ### "topic50" instead of "topic250": one summary row per a.pnw.no value.
  ### Requires matrixStats (colMaxs, colSds). NOTE(review): `thousands` is a
  ### global used for progress printing and is not defined in this chunk.
  waveanalyzer4 <- function(x){
    data <- x
    minwaves <- min(data$a.pnw.no,na.rm=T)
    maxwaves <- max(data$a.pnw.no,na.rm=T)
    # Seed data frame; further columns are created on first assignment
    wavedesc <- data.frame(id=NA,volume=NA,duration=NA,intensity=NA,start=NA,end=NA)
    for (w in minwaves:maxwaves)
    {
      wave <- subset(data,a.pnw.no==w)
  #     waveend <- subset(wave,o30dur==max(wave$o30dur,na.rm=TRUE))
      wavedesc[w-minwaves+1,"id"] <- w
      wavedesc[w-minwaves+1,"duration"] <- dim(wave)[1]
      # Guard: wave ids may be non-contiguous, so a given w can have 0 rows
      if (wavedesc[w-minwaves+1,"duration"]>0) {
        wavedesc[w-minwaves+1,"volume"] <- mean(colSums(as.matrix(wave[,c("n90", "n180", "n365", "n730", "n1825")]),na.rm=TRUE),na.rm=TRUE)
        wavedesc[w-minwaves+1,"intensity"] <- mean(colMeans(as.matrix(wave[,c("n90", "n180", "n365", "n730", "n1825")]),na.rm=TRUE),na.rm=TRUE)
        wavedesc[w-minwaves+1,"max.intensity"] <- mean(colMaxs(as.matrix(wave[,c("n90", "n180", "n365", "n730", "n1825")]),na.rm=TRUE),na.rm=TRUE)
        wavedesc[w-minwaves+1,"variability"] <- mean(colSds(as.matrix(wave[,c("n90", "n180", "n365", "n730", "n1825")]),na.rm=TRUE),na.rm=TRUE)
        wavedesc[w-minwaves+1,"start"] <- wave$day[1]
        wavedesc[w-minwaves+1,"end"] <- wave$day[length(wave$day)]
        wavedesc[w-minwaves+1,"baseline30"] <- mean(wave$i.count.30,na.rm=TRUE)
        wavedesc[w-minwaves+1,"baseline90"] <- mean(wave$i.count.90,na.rm=TRUE)
        wavedesc[w-minwaves+1,"baseline180"] <- mean(wave$i.count.180,na.rm=TRUE)
        wavedesc[w-minwaves+1,"baseline365"] <- mean(wave$i.count.365,na.rm=TRUE)
        wavedesc[w-minwaves+1,"baseline730"] <- mean(wave$i.count.730,na.rm=TRUE)
        wavedesc[w-minwaves+1,"baseline1825"] <- mean(wave$i.count.1825,na.rm=TRUE)
        wavedesc[w-minwaves+1,"topic50"] <- wave$topic[1] }
      if (w%in%thousands) print(w)
      flush.console()
    }
    return(wavedesc)
  }

  ### Like waveanalyzer4/5 but writes the topic to column "topic20" and,
  ### NOTE(review): lacks their duration>0 guard — a wave id with zero rows
  ### makes colSums()/colMeans() operate on empty matrices (warnings / NaN)
  ### and wave$topic[1] yield NA. Requires matrixStats (colMaxs, colSds);
  ### `thousands` is an undefined-in-this-chunk global used for progress.
  waveanalyzer3 <- function(x){
    data <- x
    minwaves <- min(data$a.pnw.no,na.rm=T)
    maxwaves <- max(data$a.pnw.no,na.rm=T)
    # Seed data frame; further columns are created on first assignment
    wavedesc <- data.frame(id=NA,volume=NA,duration=NA,intensity=NA,start=NA,end=NA)
    for (w in minwaves:maxwaves)
    {
      wave <- subset(data,a.pnw.no==w)
  #     waveend <- subset(wave,o30dur==max(wave$o30dur,na.rm=TRUE))
      wavedesc[w-minwaves+1,"id"] <- w
      wavedesc[w-minwaves+1,"duration"] <- dim(wave)[1]
      wavedesc[w-minwaves+1,"volume"] <- mean(colSums(as.matrix(wave[,c("n90", "n180", "n365", "n730", "n1825")]),na.rm=TRUE),na.rm=TRUE)
      wavedesc[w-minwaves+1,"intensity"] <- mean(colMeans(as.matrix(wave[,c("n90", "n180", "n365", "n730", "n1825")]),na.rm=TRUE),na.rm=TRUE)
      wavedesc[w-minwaves+1,"max.intensity"] <- mean(colMaxs(as.matrix(wave[,c("n90", "n180", "n365", "n730", "n1825")]),na.rm=TRUE),na.rm=TRUE)
      wavedesc[w-minwaves+1,"variability"] <- mean(colSds(as.matrix(wave[,c("n90", "n180", "n365", "n730", "n1825")]),na.rm=TRUE),na.rm=TRUE)
      wavedesc[w-minwaves+1,"start"] <- wave$day[1]
      wavedesc[w-minwaves+1,"end"] <- wave$day[length(wave$day)]
      wavedesc[w-minwaves+1,"baseline30"] <- mean(wave$i.count.30,na.rm=TRUE)
      wavedesc[w-minwaves+1,"baseline90"] <- mean(wave$i.count.90,na.rm=TRUE)
      wavedesc[w-minwaves+1,"baseline180"] <- mean(wave$i.count.180,na.rm=TRUE)
      wavedesc[w-minwaves+1,"baseline365"] <- mean(wave$i.count.365,na.rm=TRUE)
      wavedesc[w-minwaves+1,"baseline730"] <- mean(wave$i.count.730,na.rm=TRUE)
      wavedesc[w-minwaves+1,"baseline1825"] <- mean(wave$i.count.1825,na.rm=TRUE)
      wavedesc[w-minwaves+1,"topic20"] <- wave$topic[1]
      if (w%in%thousands) print(w)
      flush.console()
    }
    return(wavedesc)
  }

  ## One summary row per crisis news wave in `x` (waves indexed by a.pnw.no).
  ## Aggregates the n30..n1825 count windows into volume, intensity,
  ## max.intensity and variability, and records duration, start/end day, the
  ## mean i.count.* baselines and the wave topic (column "topic250").
  ## Needs matrixStats (colMaxs, colSds) and the global `thousands` vector
  ## used for progress printing.
  waveanalyzer2 <- function(x){
    data <- x
    window_cols <- c("n30", "n90", "n180", "n365", "n730", "n1825")
    minwaves <- min(data$a.pnw.no, na.rm = TRUE)
    maxwaves <- max(data$a.pnw.no, na.rm = TRUE)
    wavedesc <- data.frame(id = NA, volume = NA, duration = NA, intensity = NA, start = NA, end = NA)
    for (wave_id in minwaves:maxwaves) {
      current <- subset(data, a.pnw.no == wave_id)
      ridx <- wave_id - minwaves + 1
      mat <- as.matrix(current[, window_cols])
      ndays <- nrow(current)
      wavedesc[ridx, "id"] <- wave_id
      wavedesc[ridx, "duration"] <- ndays
      wavedesc[ridx, "volume"] <- mean(colSums(mat, na.rm = TRUE), na.rm = TRUE)
      wavedesc[ridx, "intensity"] <- mean(colMeans(mat, na.rm = TRUE), na.rm = TRUE)
      wavedesc[ridx, "max.intensity"] <- mean(colMaxs(mat, na.rm = TRUE), na.rm = TRUE)
      wavedesc[ridx, "variability"] <- mean(colSds(mat, na.rm = TRUE), na.rm = TRUE)
      wavedesc[ridx, "start"] <- current$day[1]
      wavedesc[ridx, "end"] <- current$day[ndays]
      wavedesc[ridx, "baseline30"] <- mean(current$i.count.30, na.rm = TRUE)
      wavedesc[ridx, "baseline90"] <- mean(current$i.count.90, na.rm = TRUE)
      wavedesc[ridx, "baseline180"] <- mean(current$i.count.180, na.rm = TRUE)
      wavedesc[ridx, "baseline365"] <- mean(current$i.count.365, na.rm = TRUE)
      wavedesc[ridx, "baseline730"] <- mean(current$i.count.730, na.rm = TRUE)
      wavedesc[ridx, "baseline1825"] <- mean(current$i.count.1825, na.rm = TRUE)
      wavedesc[ridx, "topic250"] <- current$topic[1]
      if (wave_id %in% thousands) print(wave_id)
      flush.console()
    }
    return(wavedesc)
  }

  ## Per-wave summary using the detrended moving averages: same layout as the
  ## other waveanalyzer variants, except that baselines come from the
  ## DMA30..DMA1825 columns and only the needed columns are kept up front.
  ## Needs matrixStats (colMaxs, colSds) and the global `thousands` vector.
  waveanalyzer <- function(x){
    data <- x[,c("a.pnw.no", "n30", "n90", "n180", "n365", "n730", "n1825", "day", "DMA30", "DMA90", "DMA180", "DMA365", "DMA730", "DMA1825", "topic")]
    minwaves <- min(data$a.pnw.no, na.rm = TRUE)
    maxwaves <- max(data$a.pnw.no, na.rm = TRUE)
    wavedesc <- data.frame(id = NA, volume = NA, duration = NA, intensity = NA, start = NA, end = NA)
    ncols <- c("n30", "n90", "n180", "n365", "n730", "n1825")
    # Output baseline column -> source DMA column
    dma_for <- c(baseline30 = "DMA30", baseline90 = "DMA90", baseline180 = "DMA180",
                 baseline365 = "DMA365", baseline730 = "DMA730", baseline1825 = "DMA1825")
    for (k in minwaves:maxwaves) {
      wv <- subset(data, a.pnw.no == k)
      i <- k - minwaves + 1
      m <- as.matrix(wv[, ncols])
      wavedesc[i, "id"] <- k
      wavedesc[i, "duration"] <- nrow(wv)
      wavedesc[i, "volume"] <- mean(colSums(m, na.rm = TRUE), na.rm = TRUE)
      wavedesc[i, "intensity"] <- mean(colMeans(m, na.rm = TRUE), na.rm = TRUE)
      wavedesc[i, "max.intensity"] <- mean(colMaxs(m, na.rm = TRUE), na.rm = TRUE)
      wavedesc[i, "variability"] <- mean(colSds(m, na.rm = TRUE), na.rm = TRUE)
      wavedesc[i, "start"] <- wv$day[1]
      wavedesc[i, "end"] <- wv$day[length(wv$day)]
      # Baselines: mean detrended moving average per smoothing window
      for (b in names(dma_for)) {
        wavedesc[i, b] <- mean(wv[[dma_for[[b]]]], na.rm = TRUE)
      }
      wavedesc[i, "topic250"] <- wv$topic[1]
      if (k %in% thousands) print(k)
      flush.console()
    }
    return(wavedesc)
  }


## Compares the vocabulary during the crisis news wave to articles about the same topic in a specified time frame prior to the crisis news wave and after the crisis news wave.

  ## Compares the vocabulary of a crisis news wave (days `start`..`end`) with a
  ## `pre`-day window before it and a `post`-day window after it.
  ## Variant 3: only the *target* window is restricted to documents loading on
  ## the wave's topic (via the global STM document-topic matrix `wide.STM` and
  ## the topic lookup table `topic.eq`); the pre/post windows are unfiltered.
  ##
  ## data:      quanteda dfm with a `days` docvar
  ## pre/post:  widths (days) of the comparison windows
  ## start/end: wave boundaries in days
  ## nover:     number of top over-represented features to report
  ## nunder:    accepted but unused in this variant
  ## topic/area: identifiers echoed into the returned list
  ## Returns a list with window dates, a combined keyword list, the target
  ## window's texts sorted by topic probability, and four top-`nover` tables.
  compareVocabulary3 <- function(data=data,pre=pre,post=post,start=start,end=end,nover=nover,nunder=nunder,topic=topic,area=area)
     {
      # NOTE(review): origin "0000-01-01" differs from the "1784-12-31" origin
      # used by compareVocabulary/compareVocabulary2 -- confirm which day-zero
      # convention this dfm's `days` docvar follows.
      startdate <- as.POSIXct(start*60*60*24,origin="0000-01-01")
      enddate   <- as.POSIXct(end*60*60*24,origin="0000-01-01")
      # pre.bag.dtm     <- dfm_subset(data,days>(start-pre) & days<start) # without topic requirement
      # target.bag.dtm  <- dfm_subset(data,days>(start) & days<end)   # without topic requirement
      # post.bag.dtm    <- dfm_subset(data,days>(end) & days<(end+post))  # without topic requirement
      pre.bag.dtm   <- dfm_subset(data,days>(start-pre) & days<start)
      target.bag.dtm    <- dfm_subset(data,days>(start) & days<end  & wide.STM[,topic.eq[as.numeric(topic),"topicX"]]>0.005)
      post.bag.dtm  <- dfm_subset(data,days>(end) & days<(end+post))
      # Per-feature counts; features absent from all three windows are dropped,
      # and zero counts are floored at 1 so the ratios below stay finite.
      full.bag  <- colSums(pre.bag.dtm)+colSums(target.bag.dtm)+colSums(post.bag.dtm)
      pre.bag   <- ifelse(colSums(pre.bag.dtm)[full.bag>0]>0,colSums(pre.bag.dtm)[full.bag>0],1)
      target.bag    <- ifelse(colSums(target.bag.dtm)[full.bag>0]>0,colSums(target.bag.dtm)[full.bag>0],1)
      post.bag  <- ifelse(colSums(post.bag.dtm)[full.bag>0]>0,colSums(post.bag.dtm)[full.bag>0],1)
      # Relative feature frequencies within each window
      pre.pr        <- pre.bag/sum(pre.bag,na.rm=TRUE)
      target.pr     <- target.bag/sum(target.bag,na.rm=TRUE)
      post.pr   <- post.bag/sum(post.bag,na.rm=TRUE)

      ### Pre-to-target
      # Probability ratio, probability difference, and raw frequency jump
      pre.prfactor  <- (target.pr/pre.pr)
      pre.prraw     <- (target.pr-pre.pr)
      pre.fraw          <- (target.bag-pre.bag)
      pre.voc.change <- data.frame(feature=names(pre.prfactor),pre.bag,target.bag,pre.pr,target.pr,probability.factor=pre.prfactor,probability.jump=pre.prraw,frequency.jump=pre.fraw,frequency.jump.relative=pre.fraw/target.bag)
      # Keep features frequent in the target window and clearly over-represented
      pre.top.voc.change <- subset(pre.voc.change,target.bag>10&target.pr>0.0001 & (probability.factor + probability.jump*5000>2))
      pre.top20.rchange <- pre.top.voc.change[order(pre.top.voc.change$probability.factor*1000*pre.top.voc.change$target.pr,decreasing=TRUE),][1:nover,]
      pre.top20.fchange <- pre.top.voc.change[order(pre.top.voc.change$frequency.jump*1000*pre.top.voc.change$target.pr,decreasing=TRUE),][1:nover,]

      ### Post-to-target
      post.prfactor     <- (target.pr/post.pr)
      post.prraw    <- (target.pr-post.pr)
      post.fraw         <- (target.bag-post.bag)
      post.voc.change <- data.frame(feature=names(post.prfactor),post.bag,target.bag,post.pr,target.pr,
                  probability.factor=post.prfactor,probability.jump=post.prraw,
                  frequency.jump=post.fraw,frequency.jump.relative=post.fraw/target.bag)
      # Note: post side uses a stricter AND criterion than the pre side
      post.top.voc.change <- subset(post.voc.change,target.bag>10&target.pr>0.0001 & (probability.factor>2 & probability.jump>0.0001))
      post.top20.rchange <- post.top.voc.change[order(post.top.voc.change$probability.factor,decreasing=TRUE),][1:nover,]
      post.top20.fchange <- post.top.voc.change[order(post.top.voc.change$frequency.jump,decreasing=TRUE),][1:nover,]

      # Union of spike keywords across the four rankings
      wordlist <- paste(    unique(c(as.character(pre.top20.rchange$feature),
                as.character(post.top20.rchange$feature),
                as.character(pre.top20.fchange$feature),
                as.character(post.top20.fchange$feature))))

      # Target-window documents with topic probability, headline and date
      topic.textlist <- data.frame(id=wide.STM[wide.STM$id%in%target.bag.dtm@docvars$id,"id"],prob=wide.STM[wide.STM$id%in%target.bag.dtm@docvars$id,topic.eq[as.numeric(topic),"topicX"]],headline=wide.STM[wide.STM$id%in%target.bag.dtm@docvars$id,"headline"],date=wide.STM[wide.STM$id%in%target.bag.dtm@docvars$id,"Time"])
      # order(topic.textlist$prob)

      return(list(startdate=startdate,enddate=enddate,
            topic=topic,area=area,
            wordlist=wordlist,
            texts=topic.textlist[order(topic.textlist$prob,decreasing=TRUE),],
            top20.relative.change.pre.to.target=pre.top20.rchange,
            top20.relative.change.post.to.target=post.top20.rchange,
            top20.absolute.change.pre.to.target=pre.top20.fchange,
            top20.absolute.change.post.to.target=post.top20.fchange))
    }

  ## Compares the vocabulary of a crisis news wave (days `start`..`end`) with a
  ## `pre`-day window before and a `post`-day window after it. Unlike
  ## compareVocabulary3, *all three* windows are restricted to documents with
  ## a loading > 0.005 on the wave's topic (via the global `topics_doc` matrix
  ## and the `to.lab` topic-label lookup).
  ##
  ## data:      quanteda dfm with a `days` docvar (days since 1784-12-31)
  ## nover:     number of top over-represented features to report
  ## nunder:    accepted but unused
  ## topic/area: identifiers echoed into the returned list
  ## Returns window dates, a combined keyword list, the (up to 40) highest-
  ## probability target-window texts, and four top-`nover` change tables.
  compareVocabulary2 <- function(data=data,pre=pre,post=post,start=start,end=end,nover=nover,nunder=nunder,topic=topic,area=area)
    {
    startdate <- as.POSIXct(start*60*60*24,origin="1784-12-31")
    enddate   <- as.POSIXct(end*60*60*24,origin="1784-12-31")
    # Internal topic column name for this 250-topic id
    topicX <- to.lab[to.lab$topic250==topic,"topic"]
    # pre.bag.dtm   <- dfm_subset(data,days>(start-pre) & days<start) # without topic requirement
    # target.bag.dtm    <- dfm_subset(data,days>(start) & days<end)   # without topic requirement
    # post.bag.dtm  <- dfm_subset(data,days>(end) & days<(end+post))  # without topic requirement
    pre.bag.dtm     <- dfm_subset(data,days>(start-pre) & days<start & topics_doc[,topicX]>0.005)
    target.bag.dtm  <- dfm_subset(data,days>(start) & days<end  & topics_doc[,topicX]>0.005)
    post.bag.dtm    <- dfm_subset(data,days>(end) & days<(end+post) & topics_doc[,topicX]>0.005)
    # Per-feature counts; features absent from all windows are dropped, zero
    # counts are floored at 1 so the probability ratios stay finite.
    full.bag    <- colSums(pre.bag.dtm)+colSums(target.bag.dtm)+colSums(post.bag.dtm)
    pre.bag     <- ifelse(colSums(pre.bag.dtm)[full.bag>0]>0,colSums(pre.bag.dtm)[full.bag>0],1)
    target.bag  <- ifelse(colSums(target.bag.dtm)[full.bag>0]>0,colSums(target.bag.dtm)[full.bag>0],1)
    post.bag    <- ifelse(colSums(post.bag.dtm)[full.bag>0]>0,colSums(post.bag.dtm)[full.bag>0],1)
    # Relative feature frequencies within each window
    pre.pr      <- pre.bag/sum(pre.bag,na.rm=TRUE)
    target.pr   <- target.bag/sum(target.bag,na.rm=TRUE)
    post.pr     <- post.bag/sum(post.bag,na.rm=TRUE)

    ### Pre-to-target
    pre.prfactor    <- (target.pr/pre.pr)
    pre.prraw       <- (target.pr-pre.pr)
    pre.fraw        <- (target.bag-pre.bag)
    pre.voc.change <- data.frame(feature=names(pre.prfactor),pre.bag,target.bag,pre.pr,target.pr,probability.factor=pre.prfactor,probability.jump=pre.prraw,frequency.jump=pre.fraw,frequency.jump.relative=pre.fraw/target.bag)
    # Keep features frequent in the target window and clearly over-represented
    pre.top.voc.change <- subset(pre.voc.change,target.bag>10&target.pr>0.0001 & (probability.factor + probability.jump*5000>2))
    pre.top20.rchange <- pre.top.voc.change[order(pre.top.voc.change$probability.factor*1000*pre.top.voc.change$target.pr,decreasing=TRUE),][1:nover,]
    pre.top20.fchange <- pre.top.voc.change[order(pre.top.voc.change$frequency.jump*1000*pre.top.voc.change$target.pr,decreasing=TRUE),][1:nover,]

    ### Post-to-target
    post.prfactor   <- (target.pr/post.pr)
    post.prraw      <- (target.pr-post.pr)
    post.fraw           <- (target.bag-post.bag)
    post.voc.change <- data.frame(feature=names(post.prfactor),post.bag,target.bag,post.pr,target.pr,
                probability.factor=post.prfactor,probability.jump=post.prraw,
                frequency.jump=post.fraw,frequency.jump.relative=post.fraw/target.bag)
    # Note: post side uses a stricter AND criterion than the pre side
    post.top.voc.change <- subset(post.voc.change,target.bag>10&target.pr>0.0001 & (probability.factor>2 & probability.jump>0.0001))
    post.top20.rchange <- post.top.voc.change[order(post.top.voc.change$probability.factor,decreasing=TRUE),][1:nover,]
    post.top20.fchange <- post.top.voc.change[order(post.top.voc.change$frequency.jump,decreasing=TRUE),][1:nover,]

    # Union of spike keywords across the four rankings
    wordlist <- paste(  unique(c(as.character(pre.top20.rchange$feature),
              as.character(post.top20.rchange$feature),
              as.character(pre.top20.fchange$feature),
              as.character(post.top20.fchange$feature))))

    # Target-window documents with topic probability, headline and date,
    # sorted by probability and truncated to the 40 best
    topic.textlist <- data.frame(id=topics_doc[topics_doc$id%in%target.bag.dtm@docvars$id,"id"],prob=topics_doc[topics_doc$id%in%target.bag.dtm@docvars$id,to.lab[which(to.lab$topic250==topic),"topic"]],headline=topics_doc[topics_doc$id%in%target.bag.dtm@docvars$id,"headline"],date=topics_doc[topics_doc$id%in%target.bag.dtm@docvars$id,"Time"])
    topic.textlist <- topic.textlist[order(topic.textlist$prob,decreasing=TRUE),]
    topic.textlist <- if(dim(topic.textlist)[1]>40){topic.textlist[1:40,]} else {topic.textlist}
    # order(topic.textlist$prob)

  return(list(startdate=startdate,enddate=enddate,
        topic=topic,area=area,
        wordlist=wordlist,
        texts=topic.textlist[order(topic.textlist$prob,decreasing=TRUE),],
        top20.relative.change.pre.to.target=pre.top20.rchange,
        top20.relative.change.post.to.target=post.top20.rchange,
        top20.absolute.change.pre.to.target=pre.top20.fchange,
        top20.absolute.change.post.to.target=post.top20.fchange))
  }

  ## compareVocabulary was a byte-for-byte duplicate of compareVocabulary2.
  ## It now delegates to that single implementation so future fixes land in
  ## one place. Signature, arguments (including the unused `nunder`, which is
  ## forwarded lazily and never evaluated) and the returned list structure
  ## are unchanged.
  compareVocabulary <- function(data=data,pre=pre,post=post,start=start,end=end,nover=nover,nunder=nunder,topic=topic,area=area)
    {
      compareVocabulary2(data=data,pre=pre,post=post,start=start,end=end,
                         nover=nover,nunder=nunder,topic=topic,area=area)
    }

# Graphically zooms in to the time the crisis news wave occurred, with different temporal resolutions

  ## Draws six stacked time-series panels that zoom progressively into one
  ## crisis news wave: the full topic series first, then windows padded by
  ## tframe[4]..tframe[1] days around the wave, and finally +/-100 days.
  ##
  ## ts:     accepted but not used -- the series is re-fetched from the global
  ##         `list.dma` by the wave's topic250 id. NOTE(review): confirm `ts`
  ##         can be dropped from the signature.
  ## nw:     one-row data frame for the wave (topic250, start, end,
  ##         start.date, end.date)
  ## nw.df:  data frame of all waves (start.date/end.date) for the outer frame
  ## tframe: four zoom paddings in days, largest expected at index 4
  ## Returns the grid.arrange object (the plots are drawn as a side effect).
  ## Requires ggplot2, reshape2 (melt), gridExtra and the globals list.dma,
  ## to.lab, theme_bluewhite.
  nw_zoom <- function(ts,nw,nw.df,tframe){
    # Daily moving-average series for this wave's topic
    tsx <- (list.dma[[which(to.lab$topic250==nw$topic250[1])]])
    tsx$date <- as.Date(tsx$day,origin="1784-12-31")
    # Long format: one row per (date, smoothing window)
    tsxl <- melt(tsx,measure.vars=c("DMA30", "DMA90", "DMA180", "DMA365", "DMA730", "DMA1825"))
    # tf1 = overall extent of all waves; tf2..tf6 = successively tighter frames
    tf1.start <- min(nw.df$start.date)
    tf1.end <- max(nw.df$end.date)
    tf2.start <- as.Date(nw$start-tframe[4],origin="1784-12-31")
    tf2.end <- as.Date(nw$end+tframe[4],origin="1784-12-31")
    tf3.start <- as.Date(nw$start-tframe[3],origin="1784-12-31")
    tf3.end <- as.Date(nw$end+tframe[3],origin="1784-12-31")
    tf4.start <- as.Date(nw$start-tframe[2],origin="1784-12-31")
    tf4.end <- as.Date(nw$end+tframe[2],origin="1784-12-31")
    tf5.start <- as.Date(nw$start-tframe[1],origin="1784-12-31")
    tf5.end <- as.Date(nw$end+tframe[1],origin="1784-12-31")
    tf6.start <- as.Date(nw$start-100,origin="1784-12-31")
    tf6.end <- as.Date(nw$end+100,origin="1784-12-31")
    # Each panel: the series, a solid red rectangle marking the wave itself,
    # and a dashed red frame showing the extent of the NEXT (tighter) panel.
    # NOTE(review): element `size=` in geom_rect is deprecated in favour of
    # `linewidth` from ggplot2 3.4 -- fine on older versions.
    gg1 <- ggplot(tsxl)+geom_line(aes(x=date,y=value,group=variable,color=variable))+geom_rect(data=nw,xmin=nw$start.date,xmax=nw$end.date,ymin=0,ymax=0.20,fill="red",alpha=0.5)+geom_rect(data=nw.df,xmin=tf2.start,xmax=tf2.end,ymin=0,ymax=0.10,fill=NA,color="red",linetype="dashed",size=1)+theme_bluewhite()+xlab("Time")+ylab("News stories per day")
    gg2 <- ggplot(subset(tsxl,date<tf2.end & date>tf2.start))+geom_line(aes(x=date,y=value,group=variable,color=variable))+geom_rect(data=nw,xmin=nw$start.date,xmax=nw$end.date,ymin=0,ymax=0.20,fill="red",alpha=0.5)+geom_rect(data=nw.df,xmin=tf3.start,xmax=tf3.end,ymin=0,ymax=0.10,fill=NA,color="red",linetype="dashed",size=1)+theme_bluewhite()+xlab("Time")+ylab("News stories per day")
    gg3 <- ggplot(subset(tsxl,date<tf3.end & date>tf3.start))+geom_line(aes(x=date,y=value,group=variable,color=variable))+geom_rect(data=nw,xmin=nw$start.date,xmax=nw$end.date,ymin=0,ymax=0.20,fill="red",alpha=0.5)+geom_rect(data=nw.df,xmin=tf4.start,xmax=tf4.end,ymin=0,ymax=0.10,fill=NA,color="red",linetype="dashed",size=1)+theme_bluewhite()+xlab("Time")+ylab("News stories per day")
    gg4 <- ggplot(subset(tsxl,date<tf4.end & date>tf4.start))+geom_line(aes(x=date,y=value,group=variable,color=variable))+geom_rect(data=nw,xmin=nw$start.date,xmax=nw$end.date,ymin=0,ymax=0.20,fill="red",alpha=0.5)+geom_rect(data=nw.df,xmin=tf5.start,xmax=tf5.end,ymin=0,ymax=0.10,fill=NA,color="red",linetype="dashed",size=1)+theme_bluewhite()+xlab("Time")+ylab("News stories per day")
    gg5 <- ggplot(subset(tsxl,date<tf5.end & date>tf5.start))+geom_line(aes(x=date,y=value,group=variable,color=variable))+geom_rect(data=nw,xmin=nw$start.date,xmax=nw$end.date,ymin=0,ymax=0.20,fill="red",alpha=0.5)+geom_rect(data=nw.df,xmin=tf6.start,xmax=tf6.end,ymin=0,ymax=0.10,fill=NA,color="red",linetype="dashed",size=1)+theme_bluewhite()+xlab("Time")+ylab("News stories per day")
    gg6 <- ggplot(subset(tsxl,date<tf6.end & date>tf6.start))+geom_line(aes(x=date,y=value,group=variable,color=variable))+geom_rect(data=nw,xmin=nw$start.date,xmax=nw$end.date,ymin=0,ymax=0.20,fill="red",alpha=0.5)+theme_bluewhite()+xlab("Time")+ylab("News stories per day") 
    gg_all <- grid.arrange(gg1,gg2,gg3,gg4,gg5,gg6,ncol=1)
    return(gg_all)
    }

  ## Builds a human-readable validation summary for one detected news wave:
  ## its dates, spike keywords at three document-frequency thresholds (the
  ## three input lists), the 20 most frequent headline words, topic
  ## signifiers, document ids, and the first 20 headlines.
  ## Requires quanteda (tokens, tokens_remove) and the global stopword set `sw`.
  validate_nw <- function(x.1,x.2,x.3,event_id){
    x1 <- x.1[[event_id]]
    x2 <- x.2[[event_id]]
    x3 <- x.3[[event_id]]
    # Tokenize headlines, drop punctuation and stopwords, flatten to words
    headline_tokens <- tokens_remove(x = tokens(x1$texts$headline, remove_punct = TRUE), sw)
    headline_words <- as.character(headline_tokens)
    word_freq <- table(headline_words)
    top_words <- names(word_freq)[order(word_freq, decreasing = TRUE)][1:20]
    list(
      "From: ... Until ..." = paste(as.Date(x1$startdate), as.Date(x1$enddate)),
      "Spike keywords" = paste0(x1$wordlist),
      "Spike keywords (words occur in at least 100 documents)" = paste0(x2$wordlist),
      "Spike keywords (words occur in at least 1000 documents)" = paste0(x3$wordlist),
      "Frequent words in headlines" = paste0(top_words),
      "Topic signifiers" = paste0(x1$topic),
      "Document IDs" = paste0(x1$texts$id),
      "Headlines" = paste0(x1$texts$headline[1:20]))
  }


  ## Highest 7-day wave id (column o7id) present in x; NAs are ignored.
  wavecounter7 <- function(x){
    max(x$o7id, na.rm = TRUE)
    }

  ## Highest 30-day wave id (column o30id) present in x; NAs are ignored.
  wavecounter <- function(x){
    max(x$o30id, na.rm = TRUE)
    }

## Named Entity Recognition

### Get top 100 hits of named entities

  ## Returns the names of the first (at most) 100 elements of x.
  ## Fix: the original indexed with 1:min(100, length(x)), which evaluates to
  ## c(1, 0) for empty input and returned a spurious NA; seq_len() yields an
  ## empty index (and an empty result) in that case.
  first100names <- function(x){
    orgnames <- names(x)[seq_len(min(100, length(x)))]
    return(orgnames)
    }

### Get phrases around mentions of an entity (keywords in context, KWIC, collocations)

  ## Collects the 10-token left and right contexts (keyword excluded) around
  ## every fixed-string, case-insensitive match of `pattern` in `texts`.
  ## Returns one "pre post" string per match. Requires quanteda's kwic().
  surround_words2 <- function(pattern,texts)
    {
      hits <- kwic(x = texts, pattern = pattern, window = 10, valuetype = "fixed", case_insensitive = TRUE)
      paste(unlist(hits$pre), unlist(hits$post))
    }

  ## Like surround_words2, but keeps the matched keyword itself and collapses
  ## all match contexts into a single space-separated string.
  ## Requires quanteda's kwic().
  surround_words <- function(pattern,texts)
    {
      hits <- kwic(x = texts, pattern = pattern, window = 10, valuetype = "fixed", case_insensitive = TRUE)
      per_match <- paste(unlist(hits$pre), unlist(hits$keyword), unlist(hits$post))
      paste(per_match, collapse = " ")
    }

### Grab relevant Wikipedia articles

  ## Looks up a named entity on a Wikipedia search-results page (URL `x`),
  ## follows the first hit and measures how often the entity's tokens occur
  ## in the article body (match rate per 1000 tokens). Performs network I/O.
  ## NOTE(review): superseded by get_knowledge(), which also handles accented
  ## characters; kept with the same output columns for reproducibility.
  ##
  ## Fixes: headmatch compared against the undefined global `entities[1]` --
  ## the sibling get_knowledge() compares against `entity`; srp[1] is now
  ## converted with as.character() before str_extract() (as in the sibling);
  ## and an empty/NA search result returns an all-NA row instead of erroring.
  get_knowledge2 <- function(x)
    {
      temp <- rvest::read_html(x)
      # The query term sits between "=" and "&" in the search URL
      entity <- stringr::str_extract(x,"\\=([:alnum:]|\\+)*\\&")
      entity <- stringr::str_remove(entity,"\\=")
      entity <- stringr::str_remove(entity,"\\&")
      # First search hit, if any
      srp <- html_elements(temp,"div.mw-search-result-heading")
      pge <- stringr::str_extract(as.character(srp[1]),pattern="wiki.([:alnum:]|_|\\.)*")
      if(length(pge)>0 && !is.na(pge))
        {
          # FIX: was stringdist(..., entities[1]); `entities` is undefined here
          headmatch <- stringdist(str_remove(pge[1],"wiki."),entity)
          dlpage <- rvest::read_html(paste0("http://en.wikipedia.org/",pge))
          # Article body = concatenated <p> text
          content <- paste(html_text(html_elements(dlpage,"p")),collapse="")
          txt <- stringr::str_to_lower(content)
          # Query tokens are "+"-separated in the URL
          ent <- unlist(stringr::str_split(entity,"\\+"))
          textlength <- ntoken(txt)
          textmatch <- unlist(lapply(txt,stringr::str_count,ent))
          matchrate <- 1000*textmatch/textlength
          avg.matchrate <- ifelse(length(matchrate)==1,matchrate,mean(matchrate,na.rm=TRUE))
          paste.matchrate <- paste(round(matchrate,2),collapse="///")
          know <- data.frame(entity=entity,page=pge,content=content,avg.matchrate=avg.matchrate,matchrate=paste.matchrate,nwords=textlength)
        }
      else
        {
          # No usable search result: same columns, all NA except the entity
          know <- data.frame(entity=entity,page=NA,content=NA,avg.matchrate=NA,matchrate=NA,nwords=NA)
        }
      return(know)
    }

  ## Searches Wikipedia (search-results page URL in `x`), follows the first
  ## hit, and measures how often the query entity's tokens appear in the
  ## article body (per-1000-token match rate). Returns a one-row data.frame:
  ## entity, page, content, avg.matchrate, matchrate ("///"-separated per
  ## token), nwords, closeness (string distance of page title vs entity) and
  ## size (nchar of the entity). Requires rvest, stringr, stringdist and
  ## quanteda's ntoken(); performs network I/O.
  get_knowledge <- function(x)
    {
      temp <- rvest::read_html(x)
      # Query term sits between "=" and "&" in the URL; the class allows
      # percent-encoding, "+"/"_" separators and common accented letters
      entity <- stringr::str_extract(x,"\\=[[:alnum:]\\%\\+\\_éèáàúùêûîíìóòúùûôæøåäöü]*\\&")
      entity <- stringr::str_remove(entity,"\\=")
      entity <- stringr::str_remove(entity,"\\&")
      # First search hit (empty nodeset when the search found nothing)
      srp <- html_elements(temp,"div.mw-search-result-heading")
      pge <- stringr::str_extract(as.character(srp[1]),pattern="wiki.[[:alnum:][\\_][\\-][\\.][\\%][\\,][\\?][\\!][\\(][\\)][\\\\][\\/][\\:]éèáàúùêûîíìóòúùûôæøåäöü]*")
      if(length(pge)>0)
        {
          # NOTE(review): if str_extract matched nothing, pge is NA (length 1)
          # and read_html() below will fail -- consider also testing !is.na(pge)
          headmatch <- stringdist(str_to_lower(str_remove(pge[1],"wiki.")),entity)
          dlpage <- rvest::read_html(paste0("http://en.wikipedia.org/",pge))
          # Article body = concatenated <p> text
          content <- paste(html_text(html_elements(dlpage,"p")),collapse="")
          txt <- stringr::str_to_lower(content)
          # Query tokens ("+"-separated); "_" is mapped back to spaces below
          ent <- unlist(stringr::str_split(entity,"\\+"))
          textlength <- ntoken(txt)
          textmatch <- unlist(lapply(txt,stringr::str_count,str_replace_all(ent,"\\_", " ")))
          matchrate <- 1000*textmatch/textlength
          avg.matchrate <- ifelse(length(matchrate)==1,matchrate,mean(matchrate,na.rm=TRUE))
          paste.matchrate <- paste(round(matchrate,2),collapse="///")
          know <- data.frame(entity=entity,page=pge,content=content,avg.matchrate=avg.matchrate,matchrate=paste.matchrate,nwords=textlength,closeness=headmatch,size=nchar(entity))
        }
      if(length(pge)==0)
        {
          # No search results: return an all-NA row with the same columns
          know <- data.frame(entity=entity,page=NA,content=NA,avg.matchrate=NA,matchrate=NA,nwords=NA,closeness=NA,size=nchar(entity))
        }
      return(know)
    }

### Sample entities 1

  ## Draws up to `size` elements from x without replacement; returns a
  ## sentinel string when there is nothing to draw.
  ## Fix: the original used ifelse() as scalar control flow with side-effect
  ## assignment branches, which relies on ifelse()'s internal promise
  ## evaluation order; a plain if/else expresses the same logic safely.
  ## NOTE(review): when x is a single *numeric* value n, sample() draws from
  ## 1:n (base-R footgun); the callers here pass character entity vectors,
  ## where this does not arise.
  sampler <- function(x,size){
      asize <- min(size, length(x))
      if (length(x) > 0) {
        # try() so a sampling failure yields a try-error object, not an abort
        result <- try(sample(x = x, size = asize, replace = FALSE, prob = NULL))
      } else {
        result <- "No_entities_to_draw_for_this_year"
      }
      return(result)
    }

### Sample entities 2

  ## Randomly selects n elements from x (without replacement).
  ## Fix: removed the unused local `population_size`; sample() already
  ## validates that n does not exceed the population.
  rsel <- function(x,n)
    {
      sample(x, n)
    }

# Analyses and graphical display

### Count number of unique cases

  ## Number of distinct values in x (NA counts as one distinct value).
  uniq_count <- function(x)
  {
    length(unique(x))
  }

### Count occurrences of y in x

  ## Total number of matches of pattern `y` across all elements of `x`.
  ## x is coerced to character; y is a regex passed to stringr::str_count.
  ## Fix: na.rm = TRUE spelled out instead of T (T is reassignable).
  sumy <- function(x,y)
    {
      mx <- as.character(x)
      isx <- str_count(mx, y)
      # NA elements (e.g. from NA inputs) are skipped in the total
      sx <- sum(isx, na.rm = TRUE)
      return(sx)
    }

### Extract coefficients from GLM models for decades

  ## Per-decade estimates from a fitted GLM: the reference coefficient named
  ## `y` plus each fdecadeYYYY:...r interaction coefficient (0 for the
  ## reference decade itself). Requires stringr's str_detect().
  coef_extr <- function(x,y)
    {
      all_coefs <- coef(x)
      coef_names <- names(all_coefs)
      baseline <- all_coefs[coef_names == y]
      # 0 for the reference decade, then each decade interaction shift
      decade_shifts <- c(0, all_coefs[str_detect(coef_names, "fdecade[[:digit:]]{4,4}:.*r")])
      baseline + decade_shifts
    }


### UNFINISHED estimate calculation era effects GLM

  ## NOTE(review): unfinished stub (see the "UNFINISHED" section header).
  ## It extracts two coefficient columns from df.coef but never combines
  ## them; the last assignment is returned invisibly, so callers get ycoef.
  ## Complete the calculation or remove before use.
  calc.est <- function(x,y,df.coef)
    {
      xcoef <- df.coef[,x]
      ycoef <- df.coef[,y]
    }

### Bluish graphics template

  ## ggplot2 theme: light-blue grid and borders on an aliceblue panel over a
  ## lightskyblue background, with bold axis titles and angled x labels.
  ## NOTE(review): base_size and base_family are accepted but never passed to
  ## theme_bw(), so they currently have no effect -- confirm whether
  ## theme_bw(base_size, base_family) was intended before changing output.
  ## NOTE(review): element_line/element_rect `size=` is deprecated in favour
  ## of `linewidth` from ggplot2 3.4 -- fine on older versions.
  theme_bluewhite <- function (base_size = 11, base_family = "Open Sans") {
      theme_bw() %+replace% 
      theme(
        panel.grid.major  = element_line(color = "lightskyblue2",linetype="solid",size=.5),
        panel.grid.minor  = element_line(color = "lightskyblue3",linetype="dotted",size=.33),
        panel.background = element_rect(fill = "aliceblue"),
        panel.border = element_rect(color = "lightskyblue3", fill = NA),
    plot.background= element_rect(fill="lightskyblue1"),
        axis.line = element_line(color = "lightskyblue3"),
        axis.ticks = element_line(color = "lightskyblue3"),
        axis.text = element_text(color = "black"),
    axis.text.x = element_text(angle=45,hjust=1,vjust=1),
    axis.title = element_text(face="bold"),
    strip.background = element_rect(color="gray50",fill="steelblue"),
    strip.text = element_text(color="white",face="bold")
        )
  }

### Yellowish graphics template

  ## ggplot2 theme: warm sand-colored panel with white grid lines and a
  ## brown facet strip; legend background is semi-transparent.
  ## NOTE(review): base_size and base_family are accepted but never passed to
  ## theme_bw(), so they currently have no effect -- confirm whether
  ## theme_bw(base_size, base_family) was intended before changing output.
  ## NOTE(review): element_line/element_rect `size=` is deprecated in favour
  ## of `linewidth` from ggplot2 3.4 -- fine on older versions.
  theme_soft <- function (base_size = 11, base_family = "Open Sans") {
      theme_bw() %+replace% 
      theme(
        panel.grid.major  = element_line(color = "#ffffff",linetype="solid",size=0.5),
        panel.grid.minor  = element_line(color = "#ffffff",linetype="dotted",size=.33),
        panel.background = element_rect(fill = "#f6e0b5"),
        panel.border = element_rect(color = "#ffffff", fill = NA, size=1.0),
    plot.background= element_rect(fill="#fff4e6"),
        axis.line = element_line(color = "#ffffff",size=1.0),
        axis.ticks = element_line(color = "#ffffff"),
        axis.text = element_text(color = "#000000"),
    axis.text.x = element_text(angle=45,hjust=1,vjust=1),
    axis.title = element_text(face="bold"),
    strip.background = element_rect(color="#967259",fill="#be9b7b"),
    strip.text = element_text(color="white",face="bold"),
    legend.background = element_rect(fill=alpha("#f6e0b5",.5),colour = "#937342")
        )
  }

2 Load texts

2.1 Load initial text representation (noncrisis corpus)

Code
### Import texts and create data frame from it, with extracted ID numbers

# Read every plain-text article of the noncrisis (nc) Times corpus into one
# readtext object (doc_id = filename, text = article body).
nc <- readtext::readtext("..//..//in//txtdata//tms//nc//*.txt")

nc.rawdf <- data.frame(doc_id = c(nc$doc_id), text = c(nc$text))

nc.rawdf$id <- str_extract(nc.rawdf$doc_id, "[:upper:]{2,5}[:digit:]{5,15}") # extracts the Gale ID number of the articles for later identification of units.

### Import metadata for texts

# All CSV metadata files for the nc corpus, with their relative paths.
# NOTE(review): pattern is a regex, not a glob -- "*.csv" happens to match
# anything containing ".csv"; "\\.csv$" would be the precise form.
filelist.meta <- paste0("..//..//in//metadata//tms//nc//", list.files(path = "..//..//in//metadata//tms//nc", pattern = "*.csv"))

# row.names = 11: the 11th column (the Gale document number) becomes the row
# names. NOTE(review): read.csv fails on duplicate row names -- confirm that
# column is unique within every file.
metadata_list <- lapply(filelist.meta, read.csv, sep = ",", row.names = 11, header = TRUE)

# rlist::list.rbind stacks the per-file data frames into one.
metadata <- list.rbind(metadata_list)

# metadata <- rbind(
#   read.csv(filelist.meta[[1]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[2]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[3]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[4]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[5]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[6]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[7]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[8]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[9]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[10]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[11]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[12]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[13]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[14]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[15]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[16]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[17]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[18]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[19]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[20]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[21]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[22]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[23]],sep=",", row.names=11,header=TRUE),
#   read.csv(filelist.meta[[24]],sep=",", row.names=11,header=TRUE))

# Assign readable column names; e1/e2/e3 are unidentified extra columns of
# the Gale export. NOTE(review): confirm the 13-column layout holds for every
# metadata file.
names(metadata) <- c("headline", "type", "genre", "newspaper", "date", "e1", "location", "author", "source", "document_number", "subject", "e2", "e3")
# The Gale ID was stored in the row names (read.csv row.names = 11 above)
metadata$id <- str_extract(rownames(metadata), pattern = "[:upper:]{2,5}[:digit:]{5,15}")

# Attach metadata to the texts by Gale ID; match() keeps nc.rawdf's row order
nc.rawdf$headline <- metadata[match(nc.rawdf$id, metadata$id), "headline"]
nc.rawdf$type <- metadata[match(nc.rawdf$id, metadata$id), "type"]
nc.rawdf$genre <- metadata[match(nc.rawdf$id, metadata$id), "genre"]
nc.rawdf$newspaper <- metadata[match(nc.rawdf$id, metadata$id), "newspaper"]
nc.rawdf$date <- metadata[match(nc.rawdf$id, metadata$id), "date"]
nc.rawdf$author <- metadata[match(nc.rawdf$id, metadata$id), "author"]
nc.rawdf$source <- metadata[match(nc.rawdf$id, metadata$id), "source"]

monthnames <- c(
  "January", "February", "March", "April", "May", "June",
  "July", "August", "September", "October", "November", "December"
)

nc.rawdf$year <- str_extract(nc.rawdf$date, "[:digit:]{4}")
# Extracts the year from the date metadata.
nc.rawdf$day <- str_extract(nc.rawdf$date, "[:digit:]{1,2}")
# Extracts the day of the month from the date metadata.
# NOTE(review): this takes the *first* 1-2 digit run in the date string; that
# is the day only when the day precedes the year (e.g. "March 12, 1985") --
# confirm the export's date format.
nc.rawdf$month <- str_extract(nc.rawdf$date, "[:alpha:]{3,9}")
# Extracts the month from the data metadata.
nc.rawdf$Month <- car::Recode(nc.rawdf$month, "
                                            'January'='01';'February'='02';'March'='03';
                                            'April'='04';'May'='05';'June'='06';'July'='07';
                                            'August'='08';'September'='09';'October'='10';
                                            'November'='11';'December'='12'") # Month names are recoded to zero-padded month numbers. This prevents mixing days and months when switching between dd/mm/yyyy and mm/dd/yyyy formats.

nc.rawdf$time <- with(nc.rawdf, paste0(year, "-", month, "-", day))
# Full representation of the date using month *names* (e.g. "1985-March-12").
nc.rawdf$Time <- with(nc.rawdf, paste0(year, "-", Month, "-", day))
# Full representation of the date using numeric months (e.g. "1985-03-12").

### Basic cleaning procedures

# Fix OCR artefacts in the article texts. Each replacement must operate on the
# progressively cleaned nc.r$text: the original sourced every call from
# nc.rawdf$text, so each assignment discarded all previous replacements and
# only the final one (apostrophe removal) survived.
# NOTE(review): the ^...$ anchored patterns match only when an entire string
# equals the token; confirm the shape of nc.rawdf$text (one token per element?).
nc.r <- nc.rawdf
# Remove hyphenation artefacts.
nc.r$text <- str_replace_all(string = nc.r$text, pattern = " - ", replacement = "")
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "-", replacement = "")
# Common OCR misreadings of frequent function words.
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^ofth$", replacement = "often")
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^ar[bho]$", replacement = "are")
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^th[bho]$", replacement = "the")
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^on[bho]$", replacement = "one")
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^ther[bho]$", replacement = "there")
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^hav[bho]$", replacement = "have")
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^her[bho]$", replacement = "here")
# Expand abbreviated month names.
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^[jJ]an$", replacement = "January")
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^[fF]eb$", replacement = "February")
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^[mM]ar$", replacement = "March")
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^[aA]pr$", replacement = "April")
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^[mM]ay$", replacement = "May")
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^[jJ]un$", replacement = "June")
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^[jJ]ul$", replacement = "July")
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^[aA]ug$", replacement = "August")
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^[sS]ep$", replacement = "September")
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^[oO]ct$", replacement = "October")
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^[nN]ov$", replacement = "November")
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "^[dD]ec$", replacement = "December")
# Strip apostrophes (so contractions match the stopword list below).
nc.r$text <- str_replace_all(string = nc.r$text, pattern = "'", replacement = "")

# Custom stopwords for the non-crisis corpus: time/measurement fragments and
# contractions, with and without apostrophes (apostrophes were stripped above).
sw <- c(
  "oclock", "minut", "yard", "morn", "half-past",
  "dont", "don't", "say", "did", "didn't",
  "get", "want", "just", "think",
  "cant", "cannot", "theyr", "wasnt", "wouldnt",
  "wasn't", "wouldn't", "havent", "haven't", "shouldn't", "should not"
)

# save(nc.rawdf,file="nc_text.RData")

# save(nc.r,file="nc_text2.RData")

# Keep one row per article ID (duplicate IDs arise when the same article was
# delivered in several files).
nc.text <- nc.rawdf[!duplicated(nc.rawdf$id), ]

nc.corpus <- corpus(nc.text)

# NOTE(review): relative setwd() — assumes we are in the project root
# described in the documentation header; confirm before running.
setwd(".//crisis_non")

# English vocabulary list used to separate real words from OCR "nonwords".
endict <- read.csv(file = "..//..//ENGVOC_lo.txt")
endict <- as.character(endict[, 1])
# ensure that the vocabulary list is represented as character vector.

# Extended custom stopword list for the topic-model DTM. Groups, in order:
# contractions and filler words; detached word fragments; mojibake artefacts
# from encoding errors; isolated letters and digits; frequent OCR debris.
# These carry no topical meaning and are removed as late as possible.
sw <- c(
  # contractions / filler
  "oclock", "minut", "yard", "morn", "half-past", "dont",
  "don't", "say", "did", "didn't", "get", "want", "just", "think",
  "cant", "cannot", "theyr", "wasnt", "wouldnt", "wasn't", "wouldn't",
  "havent", "haven't", "shouldn't",
  # detached word fragments
  "ing", "tion", "con", "com", "pro",
  # mojibake / symbol artefacts
  "ã‚â£", "ã‚â£m", "â£m", "•", "â–º", "“the", "britain’",
  "¬", "â", "â€", "~", "ã",
  # isolated letters
  "b", "c", "d", "e", "f", "g", "h", "i",
  "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w",
  "x", "y", "B", "C", "D", "E", "F", "G", "H", "J", "K", "L", "M", "N",
  "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
  # isolated digits
  "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
  # OCR debris
  "ll", "^", "th", "ot", "15",
  "20", "te", ">", "id", "il", "´", "`", "la", "ii", "en", "al", "tbe",
  "re", "lt", "li", "ar", "fo", "thie", "°", "aud", "le", "es", "ss",
  "od", "bo", "ir", "tho"
)


# To avoid deleting person names, we create a compilation of first names and surnames. The US census data are perfect because they consider names from a wide range of regions and languages.
# `babynames` column 3 holds first names; `surnames2010` column 1 holds
# surnames. Both are lower-cased and de-duplicated for matching.
person.names <- as.matrix(babynames[, 3])
person_names <- tolower(unique(person.names))
surnames <- tolower(surnames2010[, 1])

# Tokens objects with and without punctuation
nc.tokens.punct <- tokens(nc.corpus, remove_punct = FALSE)
nc.tokens <- tokens(nc.corpus, remove_punct = TRUE)

# Quanteda DTM
# Standard English stopwords plus the custom `sw` list are removed directly
# when building the document-feature matrix (as late as possible).
nc.dtm <- dfm(tokens_remove(nc.tokens, pattern = c(stopwords("en"), sw)))
docvars(nc.dtm)$Corpus <- "Noncrisis"

2.2 Load initial text representation (crisis corpus)

Code
###########################################
# IV) READ TEXT FILE AND CREATE BASIC TEXT DATA

### Loading text files

## Factiva

# Read all .txt exports for each year via an explicit path pattern instead of
# chained setwd() calls. The original did setwd(".//texts//2020") and then
# setwd(".//texts//2019"): the second call resolves relative to the *new*
# working directory (i.e. texts/2020/texts/2019) and fails after the first
# step. Reading by path also leaves the working directory untouched for the
# Gale section below, which expects to start from the project root.
tx2020 <- readtext::readtext(".//texts//2020//*.txt") # all .txt files for 2020
tx2019 <- readtext::readtext(".//texts//2019//*.txt") # all .txt files for 2019
tx2018 <- readtext::readtext(".//texts//2018//*.txt") # all .txt files for 2018
tx2017 <- readtext::readtext(".//texts//2017//*.txt") # all .txt files for 2017
tx2016 <- readtext::readtext(".//texts//2016//*.txt") # all .txt files for 2016
tx2015 <- readtext::readtext(".//texts//2015//*.txt") # all .txt files for 2015

# Split each raw Factiva file into individual articles.
# Factiva bundles many articles per .txt file; each article ends with a
# document ID matching "T" + 24 alphanumerics, which serves both as the split
# point and as the article identifier. Results go into t2015..t2020 (article
# texts) and id2015..id2020 (IDs); the first element of each vector is a
# corpus label ("Factiva <year>"), matching the original objects.
# (Replaces six copy-pasted loops with one parameterized loop.)
for (yr in 2015:2020) {
  raw <- get(paste0("tx", yr))
  texts <- paste("Factiva", yr)
  ids <- paste("Factiva", yr)
  for (i in seq_len(nrow(raw))) {
    parts <- unlist(str_split(string = raw[i, "text"], pattern = "T[:alnum:]{24}"))
    doc_ids <- unlist(str_extract_all(raw[i, "text"], pattern = "T[:alnum:]{24}"))
    # The fragment after the last document ID is trailing junk, so drop it.
    # (The original used temp[1:length(temp) - 1], which only works via the
    # precedence quirk (1:n) - 1; head(x, -1) states the intent directly.)
    texts <- c(texts, head(parts, -1))
    ids <- c(ids, doc_ids)
  }
  assign(paste0("t", yr), texts)
  assign(paste0("id", yr), ids)
}

# Extract per-article metadata from the raw Factiva texts for every year.
# (Replaces six hand-copied blocks of identical str_extract()/str_replace()
# calls; the resulting globals date2015..year2020 are unchanged.)
for (yr in 2015:2020) {
  tt <- get(paste0("t", yr))
  # Publication date, e.g. "12 March 2015".
  assign(paste0("date", yr), str_extract(string = tt, pattern = "[:digit:]{1,2}\\s.{3,9}\\s[:digit:]{4}"))
  # Author: a "Firstname Lastname" line on its own.
  assign(paste0("author", yr), str_extract(string = tt, pattern = "\\n[:alpha:]{2,20}\\s{1}[:alpha:]{2,30}\\n"))
  # Headline block: the two lines preceding the word count ("NNN Wörter" —
  # the Factiva export interface was German).
  assign(paste0("head", yr), str_extract(string = tt, pattern = "\\n.*\\n.*\\n[:digit:]{1,4}\\sWörter"))
  # Strip the publisher footer and the metadata header (everything up to the
  # year in the date line).
  txt <- str_replace(string = tt, pattern = "\\n\\nNews UK & Ireland Limited\\n\\nDokument", " ")
  txt <- str_replace(string = txt, pattern = paste0("(\\n){2}[[\\s\\S]]*", yr), " ")
  assign(paste0("text", yr), txt)
  # Four-digit year, taken from the extracted date.
  assign(paste0("year", yr), str_extract(string = get(paste0("date", yr)), pattern = "[:digit:]{4}"))
}

# Combine all years into one data frame of articles with metadata.
ftx <- data.frame(
  text = c(text2015, text2016, text2017, text2018, text2019, text2020),
  year = c(year2015, year2016, year2017, year2018, year2019, year2020),
  headline = c(head2015, head2016, head2017, head2018, head2019, head2020),
  author = c(author2015, author2016, author2017, author2018, author2019, author2020),
  date = c(date2015, date2016, date2017, date2018, date2019, date2020),
  id = c(id2015, id2016, id2017, id2018, id2019, id2020)
)

# Corpus of the Factiva article texts.
fcp <- corpus(ftx$text)

# Tokenize the lower-cased article texts.
ftok <- quanteda::tokens(str_to_lower(ftx$text), remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = FALSE, split_hyphens = TRUE, remove_separators = TRUE)

# Lemmatize via the lexicon hash table.
ftok2 <- tokens_replace(ftok, pattern = lexicon::hash_lemmas$token, replacement = lexicon::hash_lemmas$lemma, valuetype = "fixed")

# Keep only the first 50 tokens of every document. head() truncates and also
# handles zero-length documents correctly: the original preallocation + loop
# with x[1:min(length(x), 50)] yielded a single NA token for an empty
# document, because 1:0 is c(1, 0).
ftok3 <- lapply(ftok2, head, n = 50)

# Rebuild a character vector from the truncated token lists ...
fcha3 <- unlist(lapply(ftok3, str_c, collapse = " "))

# ... and re-tokenize it for the dictionary lookup below.
ftok4 <- quanteda::tokens(fcha3)

# Same pipeline for the headlines (no truncation needed).
htok <- quanteda::tokens(str_to_lower(ftx$headline), remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = FALSE, split_hyphens = TRUE, remove_separators = TRUE)

htok2 <- tokens_replace(htok, pattern = lexicon::hash_lemmas$token, replacement = lexicon::hash_lemmas$lemma, valuetype = "fixed")

htok3 <- htok2

# Glob-style keyword dictionary: each key maps to wildcard patterns so that
# inflected and derived forms ("crises", "collapsed", ...) are caught too.
myKeys <- dictionary(list(
  crisis = c("*cris*"),
  disaster = c("*disast*"),
  catastrophe = c("*catastroph*"),
  collapse = c("*collaps*"),
  breakdown = c("*breakdown*"),
  recession = c("*recessio*"),
  debacle = c("*debacl*"),
  emergency = c("*emergency*"),
  distress = c("*distress*"),
  plight = c("*pligh*"),
  hardship = c("*hardship*"),
  death = c("*death*", "*dead*"),
  pandemic = c("*pandem*"),
  epidemic = c("*epidem*")
))

# Keyword hits in the first-50-token texts and in the headlines.
keys_ftok <- tokens_lookup(ftok4, dictionary = myKeys, levels = 1)
keys_htok <- tokens_lookup(htok3, dictionary = myKeys, levels = 1)

# Number of keyword hits per document (printed for inspection).
lapply(keys_ftok, length)

# How many articles have at least one hit in text or headline?
table((unlist(lapply(keys_ftok, length)) + unlist(lapply(keys_htok, length))) > 0)

# Total keyword hits per article, used later for article selection.
ftx$key <- (unlist(lapply(keys_ftok, length)) + unlist(lapply(keys_htok, length)))

# ftok <- tokenize_word(ftx$text,split_hyphens=FALSE)

## Gale

setwd(".//texts//gale")

# filelist gives the names of the raw text files
filelist1 <- list.files(pattern = "*.txt")
filelist <- c(filelist1)

textfiles <- readtext::readtext("*.txt") # reads the texts from all .txt files in the current working directory

textfiles$id <- str_extract(textfiles[, 1], "[:upper:]{2}[:digit:]{5,15}") # extracts the Gale ID number of the articles for later identification of units.

# Names of the metadata CSV files in the same directory.
filelist3 <- list.files(pattern = "*.csv")

# Read and stack all metadata files in one pass. Fixes:
#  - read.csv()/scan() require a single-byte separator, so sep = ", "
#    (comma + space) raises "invalid 'sep' value: must be one byte"; the
#    correct separator is "," (as in the commented-out code above).
#  - growing `metadata` with rbind() inside a loop is quadratic;
#    do.call(rbind, ...) binds all pieces at once.
metadata <- do.call(
  rbind,
  lapply(filelist3, read.csv, sep = ",", row.names = 11, header = TRUE)
)



# Metadata

# Gives meaningful variable names for the metadata.
names(metadata) <- c("headline", "type", "genre", "newspaper", "date", "publisher", "place", "author", "archive")
# Creates an ID variable from the row names (previously the ID existed only as row names).
metadata$id <- str_extract(rownames(metadata), "[:upper:]{2}[:digit:]{5,15}")

# For each textfile, the row of the matching metadata entry (Gale ID as identifier).
metadatamatcher <- match(textfiles$id, metadata$id)

# Copy the matched metadata fields onto the textfile object in one loop
# (replaces nine hand-written per-column assignments).
for (v in c("date", "newspaper", "headline", "genre", "place", "publisher", "author", "archive", "type")) {
  textfiles[[v]] <- metadata[metadatamatcher, v]
}

# NOTE(review): this writes one value per *textfiles* row onto `metadata`,
# whose rows are in a different order (and possibly a different count) —
# looks unintended; verify whether metadata$year is used downstream.
metadata$year <- str_extract(textfiles$date, "[:digit:]{4}")

monthnames <- c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")

textfiles$year <- str_extract(textfiles$date, "[:digit:]{4}") # Extracts the year from the date metadata.
textfiles$day <- str_extract(textfiles$date, "[:digit:]{1,2}") # Extracts the day of the month (first 1-2 digit run) from the date metadata.
textfiles$month <- str_extract(textfiles$date, "[:alpha:]{3,9}") # Extracts the month from the date metadata.
textfiles$Month <- Recode(textfiles$month, "    'January'='01';'February'='02';'March'='03';
                        'April'='04';'May'='05';'June'='06';'July'='07';
                        'August'='08';'September'='09';'October'='10';
                        'November'='11';'December'='12'") # Month names are recoded to zero-padded month numbers. This prevents mixing days and months when switching between dd/mm/yyyy and mm/dd/yyyy formats.

textfiles$time <- with(textfiles, paste0(year, "-", month, "-", day)) # Full date with the month as a name.
textfiles$Time <- with(textfiles, paste0(year, "-", Month, "-", day)) # Full date with the month as a zero-padded number.

# Document variables plus text of the reduced corpus (cp_red built elsewhere).
tms_tx_u <- data.frame(docvars(cp_red), text = as.character(cp_red))

3 Text preprocessing

3.1 Remove doublettes

Code
###########################################
# IV) DOUBLETTE REMOVAL

# Headlines and texts of all post-2014 (Factiva-era) articles.
hl <- subset(tms_tx, year > 2014)$headline
tx <- (subset(tms_tx, year > 2014)$text)

# Progress ticks for the long loops below (print every 1000th iteration).
mseq <- seq(1, 30000, 1000)

# Diagnosis frame: one row per article with headline and text lengths.
ddiag <- data.frame(hl = subset(tms_tx, year > 2014)$headline)
ddiag$hl_length <- nchar(hl)
ddiag$tx_length <- nchar(tx)

# Pairwise headline string-distance matrix (filled in the loop below).
hld <- matrix(NA, ncol = length(ddiag$hl), nrow = length(ddiag$hl))
rownames(hld) <- ddiag$hl
colnames(hld) <- ddiag$hl

# Fill the headline distance matrix row by row; the diagonal is blanked so an
# article is never counted as its own nearest doublette.
for (i in 1:length(ddiag$hl)) {
  hld[i, ] <- stringdist(a = ddiag$hl, b = ddiag$hl[i])
  hld[i, i] <- NA
  if (i %in% mseq) {
    print(i)
    flush.console()
  }
}

# Nearest-headline distance and number of headline-identical articles per row
# (rowMins/rowSums operate on the distance matrix; rowMins is from matrixStats).
ddiag$min_strdist <- rowMins(hld, na.rm = TRUE)
ddiag$number_identical <- rowSums(hld == 0, na.rm = TRUE)

# pos_dou[i, ]: positions (row indices) of articles whose headline is
# identical to article i's headline.
pos_dou <- matrix(NA, ncol = max(ddiag$number_identical), nrow = dim(ddiag)[1])
rownames(pos_dou) <- ddiag$hl

# sd_dou[i, ]: full-text string distances to those candidates.
sd_dou <- matrix(NA, ncol = max(ddiag$number_identical), nrow = dim(ddiag)[1])
rownames(sd_dou) <- ddiag$hl

# Record candidate positions only for articles that have at least one
# headline-identical partner (min_strdist == 0).
for (i in 1:dim(pos_dou)[1]) {
  if (ddiag[i, "min_strdist"] == 0) {
    strdst <- stringdist(a = rownames(pos_dou)[i], b = ddiag$hl[-i])
    zeropos <- which(strdst == 0)
    pos_dou[i, 1:length(zeropos)] <- zeropos
  }
}

# Full-text distance to each headline-identical candidate; rows without a
# candidate get the article's own text length as an upper bound.
for (i in 1:dim(sd_dou)[1]) {
  for (j in 1:dim(sd_dou)[2]) {
    if (is.na(pos_dou[i, j])) {
      sd_dou[i, j] <- ddiag[i, "tx_length"]
    }
    if (!is.na(pos_dou[i, j])) {
      sd_dou[i, j] <- stringdist(tx[i], tx[pos_dou[i, j]])
    }
  }
  if (i %in% mseq) {
    print(i)
    flush.console()
  }
}

# Minimal text distance; values above the text length (9999 appears to be a
# legacy sentinel — TODO confirm) are capped at the text length.
ddiag$min_textdist <- rowMins(sd_dou, na.rm = TRUE)
ddiag$min_textdist <- ifelse(ddiag$min_textdist > ddiag$tx_length | ddiag$min_textdist == 9999, ddiag$tx_length, ddiag$min_textdist)
# Similarity shares in [0, 1] for headline and text.
ddiag$hlsim <- (ddiag$hl_length - ddiag$min_strdist) / ddiag$hl_length
ddiag$txsim <- (ddiag$tx_length - ddiag$min_textdist) / ddiag$tx_length

ddiag$article_position <- 1:dim(ddiag)[1]
ddiag$doublette_position <- pos_dou[, 1]

# Mark an article for removal when it appears as some other article's doublette.
ddiag$kill <- ifelse(ddiag$article_position %in% unique(ddiag$doublette_position), 1, 0)


### Round 2 — repeat the doublette search on the articles kept after round 1.

ddiag2 <- subset(ddiag, kill == 0)

# Texts aligned with ddiag2 rows: ddiag rows correspond 1:1 to tx, so
# subsetting tx with the same keep-mask keeps the indices in sync.
# (Fix: the original indexed the round-1 `tx` vector with round-2 row
# indices, comparing the wrong texts after subsetting.)
tx2 <- tx[ddiag$kill == 0]

# Pairwise headline distances among the remaining articles.
hld2 <- matrix(NA, ncol = length(ddiag2$hl), nrow = length(ddiag2$hl))
rownames(hld2) <- ddiag2$hl
colnames(hld2) <- ddiag2$hl

# Diagonal blanked so an article is never its own nearest doublette.
for (i in seq_along(ddiag2$hl)) {
  hld2[i, ] <- stringdist(a = ddiag2$hl, b = ddiag2$hl[i])
  hld2[i, i] <- NA
  if (i %in% mseq) {
    print(i)
    flush.console()
  }
}

ddiag2$min_strdist <- rowMins(hld2, na.rm = TRUE)
ddiag2$number_identical <- rowSums(hld2 == 0, na.rm = TRUE)

# Positions of headline-identical candidates and their text distances.
pos_dou2 <- matrix(NA, ncol = max(ddiag2$number_identical), nrow = dim(ddiag2)[1])
rownames(pos_dou2) <- ddiag2$hl

sd_dou2 <- matrix(NA, ncol = max(ddiag2$number_identical), nrow = dim(ddiag2)[1])
rownames(sd_dou2) <- ddiag2$hl

for (i in seq_len(dim(pos_dou2)[1])) {
  if (ddiag2[i, "min_strdist"] == 0) {
    strdst <- stringdist(a = rownames(pos_dou2)[i], b = ddiag2$hl[-i])
    zeropos <- which(strdst == 0)
    pos_dou2[i, 1:length(zeropos)] <- zeropos
  }
}

# Full-text distances; rows without a candidate get their own text length.
# (seq_len also guards against the 0-column case, where 1:0 would misfire.)
for (i in seq_len(dim(sd_dou2)[1])) {
  for (j in seq_len(dim(sd_dou2)[2])) {
    if (is.na(pos_dou2[i, j])) {
      sd_dou2[i, j] <- ddiag2[i, "tx_length"]
    }
    if (!is.na(pos_dou2[i, j])) {
      sd_dou2[i, j] <- stringdist(tx2[i], tx2[pos_dou2[i, j]])
    }
  }
  if (i %in% mseq) {
    print(i)
    flush.console()
  }
}

# Fix: round 2 must summarise its own matrices (sd_dou2 / pos_dou2); the
# original reused the round-1 objects sd_dou and pos_dou here.
ddiag2$min_textdist <- rowMins(sd_dou2, na.rm = TRUE)
ddiag2$min_textdist <- ifelse(ddiag2$min_textdist > ddiag2$tx_length | ddiag2$min_textdist == 9999, ddiag2$tx_length, ddiag2$min_textdist)
ddiag2$hlsim <- (ddiag2$hl_length - ddiag2$min_strdist) / ddiag2$hl_length
ddiag2$txsim <- (ddiag2$tx_length - ddiag2$min_textdist) / ddiag2$tx_length

ddiag2$article_position <- 1:dim(ddiag2)[1]
ddiag2$doublette_position <- pos_dou2[, 1]

ddiag2$kill <- ifelse(ddiag2$article_position %in% unique(ddiag2$doublette_position), 1, 0)

# Cross-tabulations of headline vs. full-text identity (round 1 diagnostics).
table(ddiag$min_strdist == 0, ddiag$min_textdist == 0)

table(ddiag$hlsim == 1, ddiag$txsim == 1)

#  save(ddiag,file="doublette_diagnosis.RData")

3.2 (Old spellchecking procedure [self-programmed])

Code
# load(".\\ftx2.RData")
# load(".\\cp.RData")
# setwd(".\\crisis2")

# Gazetteers of place names from local CSVs (read.csv2 expects ';'-separated
# input, so worldcities.csv is presumably semicolon-delimited — TODO confirm).
places <- read.csv("placenames.csv")
cities <- read.csv2("worldcities.csv")

# First word of each place/country name, with a leading "The " stripped, so
# that place names survive the later "nonword" filtering.
placenames <- c(
  str_extract(places$Coverage..City, pattern = "[:alpha:]*"),
  str_extract(str_replace(places$places, "The ", ""), pattern = "[:alpha:]*"),
  str_extract(cities$city_ascii, pattern = "[:alpha:]*"),
  str_extract(cities$country, pattern = "[:alpha:]*[:space:]{0,1}[:alpha:]*")
)

placenames2 <- unique(placenames)

# All Factiva articles come from The Times.
ftx$newspaper <- "The Times"

# Crisis-related catchwords used to flag articles.
# Fix: "turmoil" and "turmoils" appeared twice in the original vector, which
# double-counts their matches in the counting routines below.
catchwords <- c("crisis", "crises", "catastrophic", "catastrophe", "catastrophes", "disaster", "disasters", "disastrous", "pandemic", "pandemics", "epidemic", "epidemics", "recession", "recessions", "turmoil", "turmoils", "tragedy", "tragic", "tragedies", "collapse", "collapses", "collapsed", "debacle", "debacles", "emergency", "emergencies", "chaos", "havoc", "mess", "bedlam", "ravage", "ravages")

# Working copy of the corpus for hyphen removal.
cp2 <- cp

# Replace all hyphens, also those with surplus whitespace. Takes approximately 2min
counterhits <- seq(1000, length(cp), 1000) # progress ticks every 1000 documents
for (i in 1:length(cp))
{
  cp2[[i]] <- str_replace_all(string = cp[[i]], pattern = "[:space:]*\\-[:space:]*", replacement = "")
  if (i %in% counterhits) {
    flush.console()
    print(paste0(i))
  }
}

# cp2 <- str_replace_all(string=cp,pattern="[:space:]*\\-[:space:]*",replacement="")

# cp2_low <- str_to_lower(cp2)
# More cleaning and creation of dfm. Takes approximately 30 min
tok2 <- tokens(cp2, remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = FALSE, remove_separators = TRUE)
tok_low <- tokens_tolower(tok2)


# dtm_low <- dfm(tok_low)

# save(dtm_low,file="dtm_low.RData")

# words <- dtm_low@Dimnames$features


# Load an English vocabulary to compare "nonwords" in the texts to.

# NOTE(review): relative setwd(); assumes the working directory left by the
# previous chunks — confirm before running.
setwd(".//crisis")
# setwd(".//crisis")

endict <- read.csv(file = "ENGVOC_lo.txt")
endict <- as.character(endict[, 1]) # ensure that the vocabulary list is represented as character vector.
# Custom stopwords: contractions, detached word fragments, mojibake artefacts.
sw <- c(
  "oclock", "minut", "yard", "morn", "half-past", "dont", "don't", "say", "did", "didn't", "get", "want", "just",
  "think", "cant", "cannot", "theyr", "wasnt", "wouldnt", "wasn't", "wouldn't", "havent", "haven't", "shouldn't", "should not",
  "ing", "tion", "con", "com", "pro", "ã‚â£", "ã‚â£m", "â£m", "•", "â–º", "“the", "britain’", "¬", "â", "—"
)
# stopwords that are not meaningful for the topic models and should be removed from the Document term matrix for the Topic Models.
# always remove as late as possible.

# First names (babynames, column 3) and surnames (surnames2010, column 1),
# lower-cased, to protect person names from "nonword" deletion.
person.names <- as.matrix(babynames[, 3])
person_names <- tolower(unique(person.names))
surnames <- tolower(surnames2010[, 1])
fullwordlist <- c(endict, person_names, surnames, placenames2)



# dictionary1: every string considered a real word (vocabulary + names + places).
# dictionary2: the plain English vocabulary, used for frequency weighting.
dictionary1 <- fullwordlist
dictionary2 <- endict

# quanteda dictionary mapping every vocabulary word onto itself (for counting).
dictio <- as.list(as.character(dictionary2))
names(dictio) <- as.character(dictionary2)
dictionary2.0 <- dictionary(dictio)
# One uniform draw per document, used below to sample roughly 1% of the
# corpus. NOTE(review): 487659 is presumably the corpus size — confirm it
# matches length(tok_low).
random.001.percent <- runif(min = 0, max = 100, n = 487659)

# save(dictionary2.0,file="dict2.RData")

# Word frequencies estimated on the ~1% sample.
wordfreq <- tokens_lookup(tokens_subset(tok_low, random.001.percent > 99), dictionary2.0)

dfm_wordfreq <- dfm(wordfreq)

# Vocabulary with sampled corpus counts.
dictionary2.1 <- data.frame(word = dictionary2, count = colSums(dfm_wordfreq)[match(dictionary2, colnames(dfm_wordfreq))])

# Full word list with counts; names/places not in the sample get count 1.
dictionary1.1 <- data.frame(word = c(dictionary1), count = c(colSums(dfm_wordfreq)[match(dictionary2, colnames(dfm_wordfreq))], rep(1, times = length(dictionary1) - length(dictionary2))))

# save(wordfreq,file="wordfreq.RData")

# string <- tok_low[[3]]

cleaned <- list()

# Substitute OCR errors in the first 50 tokens of each document
# (substitute_words is defined elsewhere in the project).
# NOTE(review): the start index 474016 resumes a previously interrupted run
# (earlier batches were saved to cl_*.RData below); set to 1 for a fresh run.
for (i in 474016:length(tok_low))
{
  cleaned[[i]] <- substitute_words(string = tok_low[[i]][1:50], dictionary1 = dictionary1, dictionary2 = dictionary1, dictionary2.1 = dictionary1.1)
  print(i)
  flush.console()
}

#  matchsum <- function(x,set){
#  string <- x[[1]]
#  matcher <- match(string,set)
#  matchcount <- sum(matcher,na.rm=TRUE)
#  return(matchcount)
#  }

# Catchword matches per cleaned article (matchsum is defined elsewhere in
# the project; a draft of it is in the comment above).
cleaned.match <- lapply(cleaned, matchsum, catchwords)
cleaned.match.v <- unlist(cleaned.match)

# headline.match <-

# save(cleaned,file="cl_16k.RData")
# save(cleaned,file="cl_295k.RData")
# save(cleaned,file="cl_474k.RData")
# save(cleaned,file="cl_488k.RData")
# save(tok_low,file="tok_low.RData")
# save(dictionary1,file="dictionary1.RData")
# save(dictionary1.1,file="dictionary1_1.RData")

# Demo on the first two documents only (get50 is defined elsewhere).
# NOTE(review): this OVERWRITES the full `cleaned` list built above —
# confirm this is intended before running sequentially.
cleaned <- lapply(get50(tok_low[1:2]), substitute_words, dictionary1 = dictionary1, dictionary2 = dictionary1, dictionary2.1 = dictionary1.1)


# NOTE(review): orphaned fragment — `sdm`, `romi`, and `wf` are not defined
# anywhere in this file; together with the commented lines below this looks
# like debris from the body of substitute_words() left behind when the
# function was moved elsewhere. It will error if run; confirm it can be
# deleted.
replacement_position <- which(((sdm == t(matrix(rep(romi, times = dim(sdm)[1]), nrow = 6))) * wf) == colMaxs((sdm == t(matrix(rep(romi, times = dim(sdm)[1]), nrow = 6))) * (wf)))
#   rep(dictionary2,times=ncol(sdm))[which(replacement_position==1)]

#   return(substituted)
# }

# NOTE(review): this chunk largely repeats the hyphen-removal block of the
# previous section, but with a different catchword list (breakdown/breakdowns
# instead of turmoil/tragedy/chaos/...). Confirm which list is canonical.
ftx$newspaper <- "The Times"

# Crisis-related catchwords used for article selection in this chunk.
catchwords <- c("crisis", "crises", "catastrophic", "catastrophe", "catastrophes", "disaster", "disasters", "disastrous", "pandemic", "pandemics", "epidemic", "epidemics", "recession", "recessions", "breakdown", "breakdowns", "collapse", "collapses", "collapsed", "debacle", "debacles", "emergency", "emergencies")

cp2 <- cp

# Replace all hyphens, also those with surplus whitespace. Takes approximately 2min
counterhits <- seq(1000, length(cp), 1000) # progress ticks every 1000 documents
for (i in 1:length(cp))
{
  cp2[[i]] <- str_replace_all(string = cp[[i]], pattern = "[:space:]*\\-[:space:]*", replacement = "")
  if (i %in% counterhits) {
    flush.console()
    print(paste0(i))
  }
}

# cp2 <- str_replace_all(string=cp,pattern="[:space:]*\\-[:space:]*",replacement="")

# cp2_low <- str_to_lower(cp2)
# More cleaning and creation of dfm. Takes approximately 30 min
tok2 <- tokens(cp2, remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = FALSE, remove_separators = TRUE)
tok_low <- tokens_tolower(tok2)
dtm_low <- dfm(tok_low)

# save(tok_low,file="tok_low.RData")
# save(dtm_low,file="dtm_low.RData")

# Vocabulary (feature names) of the document-feature matrix.
words <- dtm_low@Dimnames$features

# Flag every vocabulary entry that is NOT found in the respective dictionary
# (marknondict is defined elsewhere in the project).
nonfirstnames <- sapply(words, marknondict, dictionary = person_names)
nonsurnames <- sapply(words, marknondict, dictionary = surnames)
nonlexical <- sapply(words, marknondict, dictionary = endict)
nonwords <- sapply(words, marknondict, dictionary = fullwordlist)

# Diagnostics: how many vocabulary entries fall outside each dictionary.
table(nonfirstnames) #
table(nonsurnames) #
table(nonlexical) #
table(nonwords) #

# Result container: one row per article (historical tok_low corpus plus the
# recent Factiva ftok corpus), holding catchword counts.
catcher <- data.frame(headline = rep(NA, times = length(tok_low) + length(ftok)), text50 = rep(NA, times = length(tok_low) + length(ftok)))

# Lower-cased headline tokens for the historical corpus.
headline_tokens <- tokens(docvars(cp)$headline, remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE)
hl_tokens <- tokens_tolower(headline_tokens)

I <- length(tok_low)

iseq <- seq(0, length(tok_low), 1000) # progress ticks (apparently unused below)

# First 50 / 100 tokens of every article, both corpora combined
# (firstwords is defined elsewhere in the project).
text50.1 <- lapply(tok_low, firstwords, nwords = 50)
text50.2 <- lapply(ftok, firstwords, nwords = 50)
text50 <- c(text50.1, text50.2)

text100.1 <- pblapply(tok_low, firstwords, nwords = 100)
text100.2 <- pblapply(ftok, firstwords, nwords = 100)
text100 <- c(text100.1, text100.2)


# Headlines of both corpora, lower-cased; embedded newlines in the Factiva
# headlines are replaced by spaces.
head50.1 <- docvars(cp)$headline
head50.2 <- str_replace_all(ftx$headline, "\\n", " ")
head50 <- c(head50.1, head50.2)

head50 <- str_to_lower(head50)


# Catchword counts per headline and per first-50-token text
# (str_counter is defined elsewhere in the project).
catcher$headline <- unlist(lapply(head50, str_counter, pattern = catchwords))
catcher$text50 <- unlist(lapply(text50, str_counter, pattern = catchwords))
catcher$select <- catcher$headline + catcher$text50

catcher$id <- c(docvars(cp)$id, ftx$id)
catcher$year <- c(docvars(cp)$year, ftx$year)
catcher$newspaper <- c(docvars(cp)$newspaper, ftx$newspaper)

# Restrict to The Times; the archive spells the name with a trailing period
# before 2015 ("The Times.") and without one afterwards.
catcher2 <- subset(catcher, year < 2015 & newspaper == "The Times." | year > 2014 & newspaper == "The Times")

# NOTE(review): this loop looks broken / exploratory and its result appears
# unused downstream:
#  - `catcher[i] <- ...` assigns to the i-th COLUMN of the data frame and
#    will error once i exceeds ncol(catcher) + 1; a per-row target such as
#    catcher$some_count[i] was presumably intended.
#  - the hard-coded 13 suggests an earlier 13-element catchword list; the
#    current `catchwords` vector has a different length.
#  - str_match() recycles the scalar `txt` across the pattern vector here.
# Confirm whether this loop can be deleted.
for (i in 1:length(tok2)) {
  txt <- paste(c(tok2[[i]][1:50], hl_tokens[[i]]), collapse = " ")
  catcher[i] <- 13 - sum(is.na(str_match(string = txt, pattern = catchwords)), na.rm = TRUE)
  flush.console()
  print(paste0(i))
}

# Attach document-level metadata to the Factiva corpus so that its docvars
# match those of the Gale corpus (cp) before the two are concatenated.
docvars(fcp)$id <- ftx$id
docvars(fcp)$date <- ftx$date
docvars(fcp)$newspaper <- ftx$newspaper
docvars(fcp)$headline <- ftx$headline
docvars(fcp)$genre <- NA
docvars(fcp)$place <- "London, United Kingdom"
docvars(fcp)$publisher <- NA
docvars(fcp)$author <- ftx$author
docvars(fcp)$archive <- "Factiva"
docvars(fcp)$type <- "Newspaper"
docvars(fcp)$year <- ftx$year
docvars(fcp)$day <- as.numeric(str_extract(ftx$date, "^[:digit:]{1,2}"))
docvars(fcp)$month <- str_extract(ftx$date, "[:alpha:]{2,5}")
# German month-name prefixes (the Factiva export interface was German).
# NOTE(review): 'MÃ' looks like a mangled encoding of 'Mär' (März) — verify
# against the actual date strings before relying on March being matched.
docvars(fcp)$Month <- as.numeric(Recode(docvars(fcp)$month, "'Janua'=1;'Febru'=2;'MÃ'=3;'April'=4;'Mai'=5;'Juni'=6;'Juli'=7;'Augus'=8;'Septe'=9;'Oktob'=10;'Novem'=11;'Dezem'=12"))
docvars(fcp)$time <- paste0(docvars(fcp)$year, "-", docvars(fcp)$Month, "-", docvars(fcp)$day)
docvars(fcp)$Time <- paste0(docvars(fcp)$year, "-", docvars(fcp)$month, "-", docvars(fcp)$day)

# Combined corpus and tokens: Gale (historical) + Factiva (2015-2020).
tcp <- c(cp, fcp)
ttok <- c(tok_low, ftok)

catchdict <- dictionary(list(catchwords = catchwords))

# Catchword hits over full texts via dictionary lookup ...
x <- tokens_lookup(x = tok_low, dictionary = catchdict)
dfm_ca <- dfm(x)

# ... and over the first 50 / 100 tokens and the headlines via str_counter.
y <- pblapply(text50, str_counter, pattern = catchwords)
table(unlist(y))

yy <- pblapply(text100, str_counter, pattern = catchwords)
table(unlist(yy))

z <- pblapply(head50, str_counter, pattern = catchwords)
table(unlist(z))

# NOTE: this overwrites the tokens_lookup result stored in `x` above.
x <- pblapply(tok_low, str_counter, pattern = catchwords)

# Selection criterion: any catchword in the first 100 tokens or the headline.
docvars(tcp)$selector <- unlist(yy) + unlist(z)
docvars(ttok) <- docvars(tcp)

# Keep only articles with at least one catchword hit.
acp <- corpus_subset(tcp, selector > 0)
atok <- tokens_subset(ttok, selector > 0)

# Distribution of the selected articles over publication years.
plot(table(docvars(atok)$year))

# A demonstration of word substitution mechanics
demo_subst <- substitute_words(string = tok_low[[i]][1:50], dictionary1 = dictionary1, dictionary2 = dictionary1, dictionary2.1 = dictionary1.1)

# Substitution of all OCR errors with most likely word from the dictionary
## Dictionary 1 is the dictionary that includes all words that most likely are real words. It includes, for example, place names and person names.
## Dictionary 2 is the dictionary from which alternative words are considered. Here, we include only frequent words
## This is split into 18 sets of texts because the procedure is very memory-intensive and very time-consuming.

# Substitute OCR errors in all selected documents, in batches of 10,000.
# The corpus is processed batch-wise (catok.01 ... catok.19) because
# substitute_words() is very memory- and time-intensive; uncomment the save()
# call to checkpoint each batch so a crash does not lose all work.
# This replaces 19 copy-pasted calls and unifies the former catok.07a/catok.07b
# split (60001-65000 / 65001-70000) into a single catok.07 batch, matching the
# catok07.RData / catok.07 names the reload-and-combine code later expects.
# NOTE(review): dictionary1 is passed as dictionary2 as well, although the
# comments above describe dictionary2 as a separate frequent-words dictionary —
# confirm this is intended; the original calls did the same, so behavior is kept.
batch_size <- 10000
n_docs <- length(atok) # 187573 documents in the original run
batch_starts <- seq(1, n_docs, by = batch_size)
for (b in seq_along(batch_starts)) {
  idx <- batch_starts[b]:min(batch_starts[b] + batch_size - 1, n_docs)
  batch_name <- sprintf("catok.%02d", b) # catok.01, catok.02, ...
  assign(batch_name, pblapply(
    atok[idx], substitute_words,
    dictionary1 = dictionary1, dictionary2 = dictionary1,
    dictionary2.1 = dictionary1.1
  ))
  #  save(list = batch_name, file = sprintf("catok%02d.RData", b))
}

3.3 Spellcheck texts and save different text representations [hunspell]

Code
### Add names, organizations, groups, geolocations to dictionary for spellchecking; using spacyR and hunspell

### Extract entities from raw texts --- this takes very long for the entire set but is fast enough for a 10,000-text sample, which is sufficient.

# Initialize spaCy (small English model) via spacyr.
# NOTE(review): the python_executable path is machine-specific (Windows
# Anaconda install) — adjust or drop on other systems.
spacyr::spacy_initialize(model = "en_core_web_sm", python_executable = "C:\\Anaconda\\python.exe")
# Draw a 10,000-text sample for entity extraction.
# NOTE(review): sampling is done with replacement, so the sample may contain
# duplicate texts — confirm this is intended.
random_text_sample <- sample(tms_tx_u$text, size = 10000, replace = TRUE)
entities <- spacyr::spacy_extract_entity(random_text_sample) # roughly 10 min
entities_rev <- entities
# Strip everything except letters (incl. German umlauts/ß), digits and blanks
# from the extracted entity strings.
entities_rev$text <- str_remove_all(entities$text, "[^[äüöÄÜÖß]^[a-z]^[A-Z]^[:digit:]^[:blank:]]")
spacyr::spacy_finalize()

# Inspect the cleaned entity table and the distribution of entity types.
str(entities_rev)
table(entities_rev$ent_type)

# One frequency table of entity strings per spaCy entity type.
table_EVENT <- table(subset(entities_rev, ent_type == "EVENT")$text)
table_GPE <- table(subset(entities_rev, ent_type == "GPE")$text)
table_LAW <- table(subset(entities_rev, ent_type == "LAW")$text)
table_LOC <- table(subset(entities_rev, ent_type == "LOC")$text)
table_MONEY <- table(subset(entities_rev, ent_type == "MONEY")$text)
table_NORP <- table(subset(entities_rev, ent_type == "NORP")$text)
table_PERSON <- table(subset(entities_rev, ent_type == "PERSON")$text)
table_PRODUCT <- table(subset(entities_rev, ent_type == "PRODUCT")$text)
table_ORG <- table(subset(entities_rev, ent_type == "ORG")$text)
table_WORK_OF_ART <- table(subset(entities_rev, ent_type == "WORK_OF_ART")$text)

### For each entity type: sort its frequency table (descending) and keep the
### individual words of all entity strings observed more than twice, splitting
### multi-word entities into single tokens. (EVENT is tabulated above but, as
### in the original, not added to the vocabulary.)
ent_vocab_types <- c("PERSON", "GPE", "LAW", "LOC", "MONEY", "NORP", "PRODUCT", "ORG", "WORK_OF_ART")
for (ent in ent_vocab_types) {
  freq_tab <- get(paste0("table_", ent))
  freq_sorted <- freq_tab[order(freq_tab, decreasing = TRUE)]
  assign(paste0("sorted_", ent), freq_sorted)
  assign(paste0("vocab_", ent), unlist(str_split(names(freq_sorted[freq_sorted > 2]), " ")))
}

### All entity words combined: these extend the spellcheck dictionary so that
### names, places etc. are not "corrected" away.
vocab_additions <- c(vocab_PERSON, vocab_GPE, vocab_LAW, vocab_LOC, vocab_MONEY, vocab_NORP, vocab_PRODUCT, vocab_ORG, vocab_WORK_OF_ART)

### Simplify the text to avoid error messages from unexpected symbols.
### Keep letters (incl. German umlauts/ß), digits, blanks and basic sentence
### punctuation, then normalize every punctuation mark to a sentence break ". ".

simplifiedText <- pblapply(tms_tx_u$text, str_remove_all, "[^[äüöÄÜÖß]^[a-z]^[A-Z]^[:digit:]^[\\.\\,\\;\\!\\?]^[:blank:]]")

simplifiedText <- pblapply(simplifiedText, str_replace_all, "\\.", ". ")
# Fix: match every comma, not only ", " (comma followed by a space), so that
# commas directly followed by text are also converted; the parallel non-crisis
# section already used the bare "," pattern.
simplifiedText <- pblapply(simplifiedText, str_replace_all, ",", ". ")
simplifiedText <- pblapply(simplifiedText, str_replace_all, ";", ". ")
simplifiedText <- pblapply(simplifiedText, str_replace_all, "\\?", ". ")
simplifiedText <- pblapply(simplifiedText, str_replace_all, "!", ". ")

### Fully de-punctuated variant. The character class below already strips all
### punctuation, so the punctuation-replacement passes the original script ran
### on this object were no-ops and have been removed.
simplifiedTextDepunct <- pblapply(tms_tx_u$text, str_remove_all, "[^[äüöÄÜÖß]^[a-z]^[A-Z]^[:digit:]^[:blank:]]")

### Load dictionaries for spellchecking. Hunspell dictionaries.

EN_dict <- hunspell::dictionary(".//dictionaries//en_EN.dic")
US_dict <- hunspell::dictionary(".//dictionaries//en_US.dic")
DE_dict <- hunspell::dictionary(".//dictionaries//index.dic")
empty_dict <- hunspell::dictionary(".//dictionaries//empty.dic")
empty_punct <- hunspell::dictionary(".//dictionaries//empty.dic", add_words = c("xPUNKTx", "xKOMMAx", "xSEMIx", "xFRAGEx", "xAUSRUFx"))

# Fix: EN_ent is used by the spellcheck below (hunspell_check / hunspell_suggest)
# but was never defined in this script. Build it as the English dictionary
# extended with the entity vocabulary extracted above (vocab_additions), which
# was otherwise never used — mirroring the commented-out DE_ent pattern below.
EN_ent <- hunspell::dictionary(".//dictionaries//en_EN.dic", add_words = vocab_additions)

# DE_punct <- hunspell::dictionary("index.dic",add_words=c("xPUNKTx", "xKOMMAx", "xSEMIx", "xFRAGEx", "xAUSRUFx"))
# DE_ent <- hunspell::dictionary("index.dic",add_words=c(vocab_LOC,vocab_PER,vocab_ORG,vocab_MSC))
# DE_punct_ent <- hunspell::dictionary("index.dic",add_words=c(vocab_LOC,vocab_PER,vocab_ORG,vocab_MSC,c("xPUNKTx", "xKOMMAx", "xSEMIx", "xFRAGEx", "xAUSRUFx")))
# empty_punct <- hunspell::dictionary("empty.dic",add_words=c("xPUNKTx", "xKOMMAx", "xSEMIx", "xFRAGEx", "xAUSRUFx"))

### If hunspell suggests several words with the same string distance (= deviation between the original word and the suggested word), one word is sampled randomly from the set of minimum-string-distance words (alternatively, the first suggestion can be extracted)

### Spellcheck the text
### NOTE(review): EN_ent must be an English hunspell dictionary (ideally the
### base dictionary extended with vocab_additions); confirm it is defined
### before this point.

# Token streams from the simplified texts: one keeping punctuation, one without.
raw_punct_tokens <- tokens(unlist(simplifiedText), what = "word", remove_punct = FALSE, remove_symbols = TRUE, remove_numbers = FALSE, split_hyphens = FALSE, remove_separators = TRUE, include_docvars = FALSE)

raw_tokens <- tokens(unlist(simplifiedText), what = "word", remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = FALSE, split_hyphens = FALSE, remove_separators = TRUE, include_docvars = FALSE)
# Spellcheck the unique word types (not every token) to keep the expensive
# hunspell calls to a minimum.
raw_vocabulary <- quanteda::types(raw_tokens)
# cbind() of a character and a logical vector coerces to a character matrix;
# the unnamed second column becomes "V2" after data.frame() below.
checked_vocabulary <- cbind(raw_vocabulary, hunspell_check(raw_vocabulary, dict = EN_ent))

errors <- checked_vocabulary[checked_vocabulary[, 2] == FALSE, 1]
corrections_all <- pblapply(errors, hunspell_suggest, dict = EN_ent) ### This is the most time-intensive procedure, approximately 1-2 days of computation.
flat_corrections_all <- flatten(corrections_all)
corrections_first <- unlist(pblapply(flat_corrections_all, FUN = "[", 1))

# Words with no suggestion at all fall back to their original (erroneous) form.
corrections_first_NA <- ifelse(is.na(corrections_first), errors, corrections_first)

checked_voc <- data.frame(checked_vocabulary, correct_vocabulary = "NA")
# Fix: assign the NA-free corrections_first_NA here. The original assigned
# corrections_first, so words without any suggestion received a real NA and the
# ifelse() below propagated NA instead of keeping the raw word.
checked_voc[checked_voc$V2 == FALSE, "correct_vocabulary"] <- corrections_first_NA
checked_voc$correct_words <- ifelse(checked_voc$correct_vocabulary == "NA", checked_voc$raw_vocabulary, checked_voc$correct_vocabulary)

# Replace every misspelled type by its best suggestion in both token streams.
spellchecked_tokens <- tokens_replace(raw_tokens, pattern = errors, replacement = corrections_first_NA, valuetype = "fixed", case_insensitive = FALSE)

spellchecked_punct_tokens <- tokens_replace(raw_punct_tokens, pattern = errors, replacement = corrections_first_NA, valuetype = "fixed", case_insensitive = FALSE)

3.4 Hunspell NC

Code
### Add names, organizations, groups, geolocations to dictionary for spellchecking; using spacyR and hunspell

### Extract entities from raw texts --- this takes very long for the entire set but is fast enough for a 10,000-text sample, which is sufficient.

# Non-crisis corpus: nc.r is presumably the raw non-crisis data frame from an
# earlier section — confirm.
tmsnc_tx_u <- nc.r

# spacy_install() sets up spaCy itself; only needed once per machine.
spacyr::spacy_install()
spacyr::spacy_initialize(model = "en_core_web_sm")
# Draw a 10,000-text sample for entity extraction.
# NOTE(review): sampling with replacement may yield duplicate texts — confirm.
random_text_sample <- sample(tmsnc_tx_u$text, size = 10000, replace = TRUE)
entities <- spacyr::spacy_extract_entity(random_text_sample) # roughly 10 min
entities_rev <- entities
# Strip everything except letters (incl. German umlauts/ß), digits and blanks.
entities_rev$text <- str_remove_all(
  entities$text,
  "[^[äüöÄÜÖß]^[a-z]^[A-Z]^[:digit:]^[:blank:]]"
)
spacyr::spacy_finalize()

# Inspect the cleaned entity table and the distribution of entity types.
str(entities_rev)
table(entities_rev$ent_type)

### One frequency table of entity strings per spaCy entity type.
ent_types_all <- c("EVENT", "GPE", "LAW", "LOC", "MONEY", "NORP", "PERSON", "PRODUCT", "ORG", "WORK_OF_ART")
for (ent in ent_types_all) {
  assign(paste0("table_", ent), table(subset(entities_rev, ent_type == ent)$text))
}

### Sort each table (descending) and keep the individual words of all entity
### strings observed more than twice, splitting multi-word entities into
### single tokens. (EVENT is tabulated but, as in the original, not added to
### the vocabulary.)
ent_vocab_types <- c("PERSON", "GPE", "LAW", "LOC", "MONEY", "NORP", "PRODUCT", "ORG", "WORK_OF_ART")
for (ent in ent_vocab_types) {
  freq_tab <- get(paste0("table_", ent))
  freq_sorted <- freq_tab[order(freq_tab, decreasing = TRUE)]
  assign(paste0("sorted_", ent), freq_sorted)
  assign(paste0("vocab_", ent), unlist(str_split(names(freq_sorted[freq_sorted > 2]), " ")))
}

### All entity words combined, used to extend the spellcheck dictionary.
vocab_additions <- c(
  vocab_PERSON, vocab_GPE, vocab_LAW, vocab_LOC,
  vocab_MONEY, vocab_NORP, vocab_PRODUCT, vocab_ORG, vocab_WORK_OF_ART
)

### Simplify the text to avoid error messages from unexpected symbols. ETA 5-10 min.
### Keep letters (incl. German umlauts/ß), digits, blanks and basic sentence
### punctuation, then normalize every punctuation mark to a sentence break ". ".

simplifiedText <- pblapply(
  tmsnc_tx_u$text, str_remove_all,
  "[^[äüöÄÜÖß]^[a-z]^[A-Z]^[:digit:]^[\\.\\,\\;\\!\\?]^[:blank:]]"
)

simplifiedText <- pblapply(simplifiedText, str_replace_all, "\\.", ". ")
# Fix: match every comma, not only ", " (comma followed by a space), so that
# commas directly followed by text are also converted.
simplifiedText <- pblapply(simplifiedText, str_replace_all, ",", ". ")
simplifiedText <- pblapply(simplifiedText, str_replace_all, ";", ". ")
simplifiedText <- pblapply(simplifiedText, str_replace_all, "\\?", ". ")
simplifiedText <- pblapply(simplifiedText, str_replace_all, "!", ". ")

### Fully de-punctuated variant. The character class below already strips all
### punctuation, so the punctuation-replacement passes the original script ran
### on this object were no-ops and have been removed.
simplifiedTextDepunct <- pblapply(
  tmsnc_tx_u$text, str_remove_all,
  "[^[äüöÄÜÖß]^[a-z]^[A-Z]^[:digit:]^[:blank:]]"
)

### Load dictionaries for spellchecking. Hunspell dictionaries.

EN_dict <- hunspell::dictionary("..//..//..//crisis_collab//dictionaries//en_GB.dic")
US_dict <- hunspell::dictionary("..//..//..//crisis_collab//dictionaries//en_US.dic")
DE_dict <- hunspell::dictionary("..//..//..//crisis_collab//dictionaries//de_DE.dic")
empty_dict <- hunspell::dictionary("..//..//..//crisis_collab//dictionaries//empty.dic")
empty_punct <- hunspell::dictionary("..//..//..//crisis_collab//dictionaries//empty.dic", add_words = c("xPUNKTx", "xKOMMAx", "xSEMIx", "xFRAGEx", "xAUSRUFx"))

# Fix: EN_ent is used by the spellcheck below (hunspell_check / hunspell_suggest)
# but was never defined in this script. Build it as the British English
# dictionary extended with the entity vocabulary extracted above
# (vocab_additions), which was otherwise never used.
EN_ent <- hunspell::dictionary("..//..//..//crisis_collab//dictionaries//en_GB.dic", add_words = vocab_additions)

# DE_punct <- hunspell::dictionary("index.dic",add_words=c("xPUNKTx", "xKOMMAx", "xSEMIx", "xFRAGEx", "xAUSRUFx"))
# DE_ent <- hunspell::dictionary("index.dic",add_words=c(vocab_LOC,vocab_PER,vocab_ORG,vocab_MSC))
# DE_punct_ent <- hunspell::dictionary("index.dic",add_words=c(vocab_LOC,vocab_PER,vocab_ORG,vocab_MSC,c("xPUNKTx", "xKOMMAx", "xSEMIx", "xFRAGEx", "xAUSRUFx")))
# empty_punct <- hunspell::dictionary("empty.dic",add_words=c("xPUNKTx", "xKOMMAx", "xSEMIx", "xFRAGEx", "xAUSRUFx"))

### If hunspell suggests several words with the same string distance (= deviation between the original word and the suggested word), one word is sampled randomly from the set of minimum-string-distance words (alternatively, the first suggestion can be extracted)

### Spellcheck the text
### NOTE(review): EN_ent must be an English hunspell dictionary (ideally the
### base dictionary extended with vocab_additions); confirm it is defined
### before this point.

# Token streams from the simplified texts: one keeping punctuation, one without.
raw_punct_tokens <- tokens(unlist(simplifiedText), what = "word", remove_punct = FALSE, remove_symbols = TRUE, remove_numbers = FALSE, split_hyphens = FALSE, remove_separators = TRUE, include_docvars = FALSE)

raw_tokens <- tokens(unlist(simplifiedText), what = "word", remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = FALSE, split_hyphens = FALSE, remove_separators = TRUE, include_docvars = FALSE)
# Spellcheck the unique word types (not every token) to keep the expensive
# hunspell calls to a minimum.
raw_vocabulary <- quanteda::types(raw_tokens)
# cbind() of a character and a logical vector coerces to a character matrix;
# the unnamed second column becomes "V2" after data.frame() below.
checked_vocabulary <- cbind(raw_vocabulary, hunspell_check(raw_vocabulary, dict = EN_ent))

errors <- checked_vocabulary[checked_vocabulary[, 2] == FALSE, 1]
corrections_all <- pblapply(errors, hunspell_suggest, dict = EN_ent) ### This is the most time-intensive procedure, approximately 1-2 days of computation.
flat_corrections_all <- flatten(corrections_all)
corrections_first <- unlist(pblapply(flat_corrections_all, FUN = "[", 1))

# Words with no suggestion at all fall back to their original (erroneous) form.
corrections_first_NA <- ifelse(is.na(corrections_first), errors, corrections_first)

checked_voc <- data.frame(checked_vocabulary, correct_vocabulary = "NA")
# Fix: assign the NA-free corrections_first_NA here. The original assigned
# corrections_first, so words without any suggestion received a real NA and the
# ifelse() below propagated NA instead of keeping the raw word.
checked_voc[checked_voc$V2 == FALSE, "correct_vocabulary"] <- corrections_first_NA
checked_voc$correct_words <- ifelse(checked_voc$correct_vocabulary == "NA", checked_voc$raw_vocabulary, checked_voc$correct_vocabulary)

# Replace every misspelled type by its best suggestion in both token streams.
spellchecked_tokens <- tokens_replace(raw_tokens, pattern = errors, replacement = corrections_first_NA, valuetype = "fixed", case_insensitive = FALSE)

spellchecked_punct_tokens <- tokens_replace(raw_punct_tokens, pattern = errors, replacement = corrections_first_NA, valuetype = "fixed", case_insensitive = FALSE)

3.5 Create text relevant representations in various formats

Code
##### CRISIS CORPUS

### Create consistent text representations with metadata based on TOKENS objects
### Naming scheme (suffixes): u = original case (not yet lowercased),
### p = punctuation removed, s = spellchecked, l = lemmatized;
### prefixes: tk = tokens, cp = corpus, tx = data frame, qdfm/sdfm = dfm.

# Uncased token variants, each carrying the article metadata as docvars
# (all columns of tms_tx_u except the text itself).
tms_tk_u <- raw_punct_tokens
docvars(tms_tk_u) <- tms_tx_u[, names(tms_tx_u) != "text"]

tms_tk_pu <- raw_tokens
docvars(tms_tk_pu) <- tms_tx_u[, names(tms_tx_u) != "text"]

tms_tk_su <- spellchecked_punct_tokens
docvars(tms_tk_su) <- tms_tx_u[, names(tms_tx_u) != "text"]

tms_tk_spu <- spellchecked_tokens
docvars(tms_tk_spu) <- tms_tx_u[, names(tms_tx_u) != "text"]

# Lowercased variants (acronyms kept in upper case).
tms_tk <- tokens_tolower(tms_tk_u, keep_acronyms = TRUE)
tms_tk_s <- tokens_tolower(tms_tk_su, keep_acronyms = TRUE)
tms_tk_p <- tokens_tolower(tms_tk_pu, keep_acronyms = TRUE)
tms_tk_sp <- tokens_tolower(tms_tk_spu, keep_acronyms = TRUE)

# Lemmatized variants via the external token->lemma lookup hash_lemma_en
# (presumably an English lemma table loaded elsewhere — confirm its source).
tms_tk_pl <- tokens_replace(tms_tk_p, pattern = hash_lemma_en$token, replace = hash_lemma_en$lemma, valuetype = "fixed")
tms_tk_spl <- tokens_replace(tms_tk_sp, pattern = hash_lemma_en$token, replace = hash_lemma_en$lemma, valuetype = "fixed")

### Rebuild corpus objects from each tokens variant by re-joining the tokens
### with spaces. As in the original, every corpus inherits the docvars of
### tms_tk_u (all variants share the same documents/metadata).
for (sfx in c("_u", "_pu", "_su", "_spu", "", "_p", "_s", "_sp")) {
  tk_obj <- get(paste0("tms_tk", sfx))
  cp_obj <- corpus(vapply(tk_obj, paste, FUN.VALUE = character(1), collapse = " "))
  docvars(cp_obj) <- docvars(tms_tk_u)
  assign(paste0("tms_cp", sfx), cp_obj)
}

### Plain data-frame representations: the docvars plus the reassembled text.
### (tms_tx_u is intentionally overwritten with the rebuilt version.)
for (sfx in c("_u", "_pu", "_su", "_spu", "", "_p", "_s", "_sp")) {
  cp_obj <- get(paste0("tms_cp", sfx))
  assign(paste0("tms_tx", sfx), data.frame(docvars(cp_obj), text = as.character(cp_obj)))
}

### Document-feature matrices (suffix c = stopwords removed).

tms_qdfm_p <- dfm(tms_tk_p)
tms_qdfm_pc <- dfm_remove(tms_qdfm_p, pattern = c(stopwords("en")))
tms_qdfm_sp <- dfm(tms_tk_sp)
tms_qdfm_spc <- dfm_remove(tms_qdfm_sp, pattern = c(stopwords("en")))

tms_qdfm_pl <- dfm(tms_tk_pl)
# Fix: derive the cleaned lemmatized dfms from the lemmatized dfms; the
# original reused the non-lemmatized tms_qdfm_p / tms_qdfm_sp here, so
# tms_qdfm_plc / tms_qdfm_splc were not actually lemmatized.
tms_qdfm_plc <- dfm_remove(tms_qdfm_pl, pattern = c(stopwords("en")))
tms_qdfm_spl <- dfm(tms_tk_spl)
tms_qdfm_splc <- dfm_remove(tms_qdfm_spl, pattern = c(stopwords("en")))


# Trimmed variants: keep features occurring in at least 1% (t100) or 0.1%
# (t1000) of documents.
tms_qdfm_pc_t100 <- dfm_trim(tms_qdfm_pc, min_docfreq = 0.01, docfreq_type = "prop")
tms_qdfm_pc_t1000 <- dfm_trim(tms_qdfm_pc, min_docfreq = 0.001, docfreq_type = "prop")
tms_qdfm_plc_t100 <- dfm_trim(tms_qdfm_plc, min_docfreq = 0.01, docfreq_type = "prop")
tms_qdfm_plc_t1000 <- dfm_trim(tms_qdfm_plc, min_docfreq = 0.001, docfreq_type = "prop")
tms_qdfm_spc_t100 <- dfm_trim(tms_qdfm_spc, min_docfreq = 0.01, docfreq_type = "prop")
tms_qdfm_spc_t1000 <- dfm_trim(tms_qdfm_spc, min_docfreq = 0.001, docfreq_type = "prop")
tms_qdfm_splc_t100 <- dfm_trim(tms_qdfm_splc, min_docfreq = 0.01, docfreq_type = "prop")
tms_qdfm_splc_t1000 <- dfm_trim(tms_qdfm_splc, min_docfreq = 0.001, docfreq_type = "prop")

# stm-format conversions of the cleaned dfms.
tms_sdfm_pc <- convert(tms_qdfm_pc, to = "stm", docvars = docvars(tms_qdfm_pc))
tms_sdfm_pc_t100 <- convert(tms_qdfm_pc_t100, to = "stm", docvars = docvars(tms_qdfm_pc_t100))
tms_sdfm_pc_t1000 <- convert(tms_qdfm_pc_t1000, to = "stm", docvars = docvars(tms_qdfm_pc_t1000))
tms_sdfm_spc <- convert(tms_qdfm_spc, to = "stm", docvars = docvars(tms_qdfm_spc))
tms_sdfm_spc_t100 <- convert(tms_qdfm_spc_t100, to = "stm", docvars = docvars(tms_qdfm_spc_t100))
tms_sdfm_spc_t1000 <- convert(tms_qdfm_spc_t1000, to = "stm", docvars = docvars(tms_qdfm_spc_t1000))
# Fix: convert the splc dfm itself; the original converted tms_qdfm_spc twice.
tms_sdfm_splc <- convert(tms_qdfm_splc, to = "stm", docvars = docvars(tms_qdfm_splc))
tms_sdfm_splc_t100 <- convert(tms_qdfm_splc_t100, to = "stm", docvars = docvars(tms_qdfm_splc_t100))
tms_sdfm_splc_t1000 <- convert(tms_qdfm_splc_t1000, to = "stm", docvars = docvars(tms_qdfm_splc_t1000))

### Save the text representations to files (one .RData per object, named after
### the object it contains).

save(tms_tx_u, file = "tms_tx_u.RData")
save(tms_tx_pu, file = "tms_tx_pu.RData")
save(tms_tx_su, file = "tms_tx_su.RData")
save(tms_tx_spu, file = "tms_tx_spu.RData")
save(tms_tx, file = "tms_tx.RData")
save(tms_tx_p, file = "tms_tx_p.RData")
save(tms_tx_s, file = "tms_tx_s.RData")
save(tms_tx_sp, file = "tms_tx_sp.RData")

save(tms_cp_u, file = "tms_cp_u.RData")
save(tms_cp_pu, file = "tms_cp_pu.RData")
save(tms_cp_su, file = "tms_cp_su.RData")
save(tms_cp_spu, file = "tms_cp_spu.RData")
save(tms_cp, file = "tms_cp.RData")
save(tms_cp_p, file = "tms_cp_p.RData")
save(tms_cp_s, file = "tms_cp_s.RData")
save(tms_cp_sp, file = "tms_cp_sp.RData")

save(tms_tk_u, file = "tms_tk_u.RData")
save(tms_tk_pu, file = "tms_tk_pu.RData")
save(tms_tk_su, file = "tms_tk_su.RData")
save(tms_tk_spu, file = "tms_tk_spu.RData")
# Fix: was file = "tms_cp.RData", which silently overwrote the corpus file
# saved above with the tokens object.
save(tms_tk, file = "tms_tk.RData")
save(tms_tk_p, file = "tms_tk_p.RData")
save(tms_tk_s, file = "tms_tk_s.RData")
save(tms_tk_sp, file = "tms_tk_sp.RData")

save(tms_qdfm_p, file = "tms_qdfm_p.RData")
save(tms_qdfm_pc, file = "tms_qdfm_pc.RData")
save(tms_qdfm_pc_t100, file = "tms_qdfm_pc_t100.RData")
save(tms_qdfm_pc_t1000, file = "tms_qdfm_pc_t1000.RData")
save(tms_qdfm_pl, file = "tms_qdfm_pl.RData")
save(tms_qdfm_plc, file = "tms_qdfm_plc.RData")
save(tms_qdfm_plc_t100, file = "tms_qdfm_plc_t100.RData")
save(tms_qdfm_plc_t1000, file = "tms_qdfm_plc_t1000.RData")
save(tms_qdfm_sp, file = "tms_qdfm_sp.RData")
save(tms_qdfm_spc, file = "tms_qdfm_spc.RData")
save(tms_qdfm_spc_t100, file = "tms_qdfm_spc_t100.RData")
save(tms_qdfm_spc_t1000, file = "tms_qdfm_spc_t1000.RData")
save(tms_qdfm_spl, file = "tms_qdfm_spl.RData")
save(tms_qdfm_splc, file = "tms_qdfm_splc.RData")
save(tms_qdfm_splc_t100, file = "tms_qdfm_splc_t100.RData")
save(tms_qdfm_splc_t1000, file = "tms_qdfm_splc_t1000.RData")

save(tms_sdfm_pc, file = "tms_sdfm_pc.RData")
save(tms_sdfm_pc_t100, file = "tms_sdfm_pc_t100.RData")
save(tms_sdfm_pc_t1000, file = "tms_sdfm_pc_t1000.RData")
save(tms_sdfm_spc, file = "tms_sdfm_spc.RData")
save(tms_sdfm_spc_t100, file = "tms_sdfm_spc_t100.RData")
save(tms_sdfm_spc_t1000, file = "tms_sdfm_spc_t1000.RData")
save(tms_sdfm_splc, file = "tms_sdfm_splc.RData")
save(tms_sdfm_splc_t100, file = "tms_sdfm_splc_t100.RData")
save(tms_sdfm_splc_t1000, file = "tms_sdfm_splc_t1000.RData")

##### NONCRISIS CORPUS

### Create consistent text representations with metadata based on TOKENS objects
### Same naming scheme as the crisis corpus: u = original case, p = punctuation
### removed, s = spellchecked, l = lemmatized.

# Punctuation-keeping and punctuation-free tokens from the non-crisis set.
# (The docvars assignment on the next line re-assigns the object's own
# docvars; kept as in the original for symmetry with the crisis section.)
tmsnc_tk_u <- nc.tokens.punct
docvars(tmsnc_tk_u) <- docvars(nc.tokens.punct)

tmsnc_tk_pu <- nc.tokens
docvars(tmsnc_tk_pu) <- docvars(nc.tokens)

# Spellchecked variants are disabled for the non-crisis corpus:
#  tmsnc_tk_su <- spellchecked_punct_tokens
#    docvars(tmsnc_tk_su) <- tmsnc_tx_u[,names(tmsnc_tx_u)!="text"]

#  tmsnc_tk_spu <- spellchecked_tokens
#    docvars(tmsnc_tk_spu) <- tmsnc_tx_u[,names(tmsnc_tx_u)!="text"]

# NOTE(review): tmsnc_tk_su and tmsnc_tk_spu are only created by the
# commented-out lines above; the tolower calls below that use them will fail
# unless those objects exist from running section 3.4 — confirm.
tmsnc_tk <- tokens_tolower(tmsnc_tk_u, keep_acronyms = TRUE)
tmsnc_tk_s <- tokens_tolower(tmsnc_tk_su, keep_acronyms = TRUE)
tmsnc_tk_p <- tokens_tolower(tmsnc_tk_pu, keep_acronyms = TRUE)
tmsnc_tk_sp <- tokens_tolower(tmsnc_tk_spu, keep_acronyms = TRUE)

# Lemmatized variants via the external token->lemma lookup hash_lemma_en
# (presumably an English lemma table loaded elsewhere — confirm its source).
tmsnc_tk_pl <- tokens_replace(tmsnc_tk_p, pattern = hash_lemma_en$token, replace = hash_lemma_en$lemma, valuetype = "fixed")
tmsnc_tk_spl <- tokens_replace(tmsnc_tk_sp, pattern = hash_lemma_en$token, replace = hash_lemma_en$lemma, valuetype = "fixed")

### Rebuild corpus objects from each non-crisis tokens variant by re-joining
### the tokens with spaces. As in the original, every corpus inherits the
### docvars of tmsnc_tk_u.
for (sfx in c("_u", "_pu", "_su", "_spu", "", "_p", "_s", "_sp")) {
  tk_obj <- get(paste0("tmsnc_tk", sfx))
  cp_obj <- corpus(vapply(tk_obj, paste, FUN.VALUE = character(1), collapse = " "))
  docvars(cp_obj) <- docvars(tmsnc_tk_u)
  assign(paste0("tmsnc_cp", sfx), cp_obj)
}

### Plain data-frame representations: the docvars plus the reassembled text.
for (sfx in c("_u", "_pu", "_su", "_spu", "", "_p", "_s", "_sp")) {
  cp_obj <- get(paste0("tmsnc_cp", sfx))
  assign(paste0("tmsnc_tx", sfx), data.frame(docvars(cp_obj), text = as.character(cp_obj)))
}

### Document-feature matrices (suffix c = stopwords removed).

tmsnc_qdfm_p <- dfm(tmsnc_tk_p)
tmsnc_qdfm_pc <- dfm_remove(tmsnc_qdfm_p, pattern = c(stopwords("en")))
tmsnc_qdfm_sp <- dfm(tmsnc_tk_sp)
tmsnc_qdfm_spc <- dfm_remove(tmsnc_qdfm_sp, pattern = c(stopwords("en")))

tmsnc_qdfm_pl <- dfm(tmsnc_tk_pl)
# Fix: derive the cleaned lemmatized dfms from the lemmatized dfms; the
# original reused the non-lemmatized tmsnc_qdfm_p / tmsnc_qdfm_sp here, so
# tmsnc_qdfm_plc / tmsnc_qdfm_splc were not actually lemmatized.
tmsnc_qdfm_plc <- dfm_remove(tmsnc_qdfm_pl, pattern = c(stopwords("en")))
tmsnc_qdfm_spl <- dfm(tmsnc_tk_spl)
tmsnc_qdfm_splc <- dfm_remove(tmsnc_qdfm_spl, pattern = c(stopwords("en")))


# Trimmed variants: keep features occurring in at least 1% (t100) or 0.1%
# (t1000) of documents.
tmsnc_qdfm_pc_t100 <- dfm_trim(tmsnc_qdfm_pc, min_docfreq = 0.01, docfreq_type = "prop")
tmsnc_qdfm_pc_t1000 <- dfm_trim(tmsnc_qdfm_pc, min_docfreq = 0.001, docfreq_type = "prop")
tmsnc_qdfm_plc_t100 <- dfm_trim(tmsnc_qdfm_plc, min_docfreq = 0.01, docfreq_type = "prop")
tmsnc_qdfm_plc_t1000 <- dfm_trim(tmsnc_qdfm_plc, min_docfreq = 0.001, docfreq_type = "prop")
tmsnc_qdfm_spc_t100 <- dfm_trim(tmsnc_qdfm_spc, min_docfreq = 0.01, docfreq_type = "prop")
tmsnc_qdfm_spc_t1000 <- dfm_trim(tmsnc_qdfm_spc, min_docfreq = 0.001, docfreq_type = "prop")
tmsnc_qdfm_splc_t100 <- dfm_trim(tmsnc_qdfm_splc, min_docfreq = 0.01, docfreq_type = "prop")
tmsnc_qdfm_splc_t1000 <- dfm_trim(tmsnc_qdfm_splc, min_docfreq = 0.001, docfreq_type = "prop")

# stm-format conversions of the cleaned dfms.
tmsnc_sdfm_pc <- convert(tmsnc_qdfm_pc, to = "stm", docvars = docvars(tmsnc_qdfm_pc))
tmsnc_sdfm_pc_t100 <- convert(tmsnc_qdfm_pc_t100, to = "stm", docvars = docvars(tmsnc_qdfm_pc_t100))
tmsnc_sdfm_pc_t1000 <- convert(tmsnc_qdfm_pc_t1000, to = "stm", docvars = docvars(tmsnc_qdfm_pc_t1000))
tmsnc_sdfm_spc <- convert(tmsnc_qdfm_spc, to = "stm", docvars = docvars(tmsnc_qdfm_spc))
tmsnc_sdfm_spc_t100 <- convert(tmsnc_qdfm_spc_t100, to = "stm", docvars = docvars(tmsnc_qdfm_spc_t100))
tmsnc_sdfm_spc_t1000 <- convert(tmsnc_qdfm_spc_t1000, to = "stm", docvars = docvars(tmsnc_qdfm_spc_t1000))
# Fix: convert the splc dfm itself; the original converted tmsnc_qdfm_spc twice.
tmsnc_sdfm_splc <- convert(tmsnc_qdfm_splc, to = "stm", docvars = docvars(tmsnc_qdfm_splc))
tmsnc_sdfm_splc_t100 <- convert(tmsnc_qdfm_splc_t100, to = "stm", docvars = docvars(tmsnc_qdfm_splc_t100))
tmsnc_sdfm_splc_t1000 <- convert(tmsnc_qdfm_splc_t1000, to = "stm", docvars = docvars(tmsnc_qdfm_splc_t1000))

### Save the text representations to files (one .RData per object, named after
### the object it contains).

save(tmsnc_tx_u, file = "tmsnc_tx_u.RData")
save(tmsnc_tx_pu, file = "tmsnc_tx_pu.RData")
save(tmsnc_tx_su, file = "tmsnc_tx_su.RData")
save(tmsnc_tx_spu, file = "tmsnc_tx_spu.RData")
save(tmsnc_tx, file = "tmsnc_tx.RData")
save(tmsnc_tx_p, file = "tmsnc_tx_p.RData")
save(tmsnc_tx_s, file = "tmsnc_tx_s.RData")
save(tmsnc_tx_sp, file = "tmsnc_tx_sp.RData")

save(tmsnc_cp_u, file = "tmsnc_cp_u.RData")
save(tmsnc_cp_pu, file = "tmsnc_cp_pu.RData")
save(tmsnc_cp_su, file = "tmsnc_cp_su.RData")
save(tmsnc_cp_spu, file = "tmsnc_cp_spu.RData")
save(tmsnc_cp, file = "tmsnc_cp.RData")
save(tmsnc_cp_p, file = "tmsnc_cp_p.RData")
save(tmsnc_cp_s, file = "tmsnc_cp_s.RData")
save(tmsnc_cp_sp, file = "tmsnc_cp_sp.RData")

save(tmsnc_tk_u, file = "tmsnc_tk_u.RData")
save(tmsnc_tk_pu, file = "tmsnc_tk_pu.RData")
save(tmsnc_tk_su, file = "tmsnc_tk_su.RData")
save(tmsnc_tk_spu, file = "tmsnc_tk_spu.RData")
# Fix: was file = "tmsnc_cp.RData", which silently overwrote the corpus file
# saved above with the tokens object.
save(tmsnc_tk, file = "tmsnc_tk.RData")
save(tmsnc_tk_p, file = "tmsnc_tk_p.RData")
save(tmsnc_tk_s, file = "tmsnc_tk_s.RData")
save(tmsnc_tk_sp, file = "tmsnc_tk_sp.RData")

save(tmsnc_qdfm_p, file = "tmsnc_qdfm_p.RData")
save(tmsnc_qdfm_pc, file = "tmsnc_qdfm_pc.RData")
save(tmsnc_qdfm_pc_t100, file = "tmsnc_qdfm_pc_t100.RData")
save(tmsnc_qdfm_pc_t1000, file = "tmsnc_qdfm_pc_t1000.RData")
save(tmsnc_qdfm_pl, file = "tmsnc_qdfm_pl.RData")
save(tmsnc_qdfm_plc, file = "tmsnc_qdfm_plc.RData")
save(tmsnc_qdfm_plc_t100, file = "tmsnc_qdfm_plc_t100.RData")
save(tmsnc_qdfm_plc_t1000, file = "tmsnc_qdfm_plc_t1000.RData")
save(tmsnc_qdfm_sp, file = "tmsnc_qdfm_sp.RData")
save(tmsnc_qdfm_spc, file = "tmsnc_qdfm_spc.RData")
save(tmsnc_qdfm_spc_t100, file = "tmsnc_qdfm_spc_t100.RData")
save(tmsnc_qdfm_spc_t1000, file = "tmsnc_qdfm_spc_t1000.RData")
save(tmsnc_qdfm_spl, file = "tmsnc_qdfm_spl.RData")
save(tmsnc_qdfm_splc, file = "tmsnc_qdfm_splc.RData")
save(tmsnc_qdfm_splc_t100, file = "tmsnc_qdfm_splc_t100.RData")
save(tmsnc_qdfm_splc_t1000, file = "tmsnc_qdfm_splc_t1000.RData")

save(tmsnc_sdfm_pc, file = "tmsnc_sdfm_pc.RData")
save(tmsnc_sdfm_pc_t100, file = "tmsnc_sdfm_pc_t100.RData")
save(tmsnc_sdfm_pc_t1000, file = "tmsnc_sdfm_pc_t1000.RData")
save(tmsnc_sdfm_spc, file = "tmsnc_sdfm_spc.RData")
save(tmsnc_sdfm_spc_t100, file = "tmsnc_sdfm_spc_t100.RData")
save(tmsnc_sdfm_spc_t1000, file = "tmsnc_sdfm_spc_t1000.RData")
save(tmsnc_sdfm_splc, file = "tmsnc_sdfm_splc.RData")
save(tmsnc_sdfm_splc_t100, file = "tmsnc_sdfm_splc_t100.RData")
save(tmsnc_sdfm_splc_t1000, file = "tmsnc_sdfm_splc_t1000.RData")

4 Automated analyses

4.1 Structural topic model

4.1.1 Crisis Corpus

Code
##############
##############
### GET DATA

# NOTE(review): setwd() with a relative, Windows-style path makes the script
# machine-dependent; consider project-relative paths instead.
  setwd(".\\crisis2\\")
# The split-up lists with character vectors that were used for word substitution
# (each file restores one catok.NN batch object).
# NOTE(review): the substitution section saved batch 07 as two files,
# catok07a.RData / catok07b.RData — confirm catok07.RData exists and contains
# a catok.07 object before running this loader.
  load("catok01.RData")
  load("catok02.RData")
  load("catok03.RData")
  load("catok04.RData")
  load("catok05.RData")
  load("catok06.RData")
  load("catok07.RData")
  load("catok08.RData")
  load("catok09.RData")
  load("catok10.RData")
  load("catok11.RData")
  load("catok12.RData")
  load("catok13.RData")
  load("catok14.RData")
  load("catok15.RData")
  load("catok16.RData")
  load("catok17.RData")
  load("catok18.RData")
  load("catok19.RData")
# The original 187k tokens objects
  load("tok_Gale+Factiva_1788-2020_exkw.RData")

################
################
### MODIFY DATA

# A character vector of manually-defined stopwords: English stopwords plus
# OCR fragments, contractions, and mojibake artifacts frequent in the corpus.
  sw <- c(stopwords(language="en"),"oclock", "minut", "yard", "morn", "half-past", "dont", "don't", "say", "did", "didn't", "get", "want", "just", "think", "cant", "cannot", "theyr", "wasnt", "wouldnt", "wasn't", "wouldn't", "havent", "haven't", "shouldn't", "should not", "ing", "tion", "con", "com", "pro", "ã‚â£", "ã‚â£m", "â£m", "•", "â–º", "“the", "britain’", "¬", "â", "—")

# Binds together the 19 lists of character vectors  (split for memory management) into one large list (again)
  catok <- c(catok.01,catok.02,catok.03,catok.04,catok.05, catok.06,catok.07,catok.08,catok.09,catok.10, catok.11,catok.12,catok.13,catok.14,catok.15, catok.16,catok.17,catok.18,catok.19)

# Extracts the document names (titles/headlines) of the articles.
# Simplification: lapply() preserves the top-level names of its input, so the
# original names(lapply(catok, names)) is identical to names(catok).
  text.headlines <- names(catok)

# Creates a simpler list object with named character vectors, necessary for tokens() to work. 
  textraw <- lapply(catok,list_reduce)

# Creates a monolithic tokens object from the list object. From this, a DFM can be constructed.
  tex <- tokens(textraw)

# Before creating a DFM, we add metadata from the original tokens object.
  docvars(tex) <- docvars(atok)
  save(tex,file="tex.RData")

# Make lowercase 
  tex_u <- tokens_tolower(tex)
# Lemmatize
  tex_r <- tokens_replace(tex_u,pattern = lexicon::hash_lemmas$token, replacement = lexicon::hash_lemmas$lemma,valuetype="fixed")
# Remove stopwords
  tex_rs <- tokens_remove(tex_r,pattern = sw,valuetype="fixed")

  # cbind(stm_40_x$vocab,stm_40_x$settings$dim$wcounts$x)

# Construct a document-feature matrix (DFM)
  dx <- dfm(tex_rs)
  save(dx,file="dx.RData")

# Create reduced dfms for topic modeling (only words occurring in more than 100/1000 texts are retained).
  dx1000 <- dfm_trim(dx, min_docfreq = 1000, docfreq_type = "count") 
  ox1000 <- convert(dx1000, to = "stm")
  dx100 <- dfm_trim(dx, min_docfreq = 100, docfreq_type = "count") 
  ox100 <- convert(dx100, to = "stm")
  save(dx1000,file="dx1000.RData")
  save(dx100,file="dx100.RData")
  save(ox1000,file="ox1000.RData")
  save(ox100,file="ox100.RData")

# Fit a 40-topic structural topic model on the min-docfreq-1000 vocabulary.
# Spectral initialisation is deterministic given the data; the seed is kept
# for reproducibility of the remaining stochastic steps.
  stm_40_x <- stm(ox1000$documents,ox1000$vocab,K=c(40),init.type="Spectral",emtol=1e-4,seed=5834)

############################
############################
### CRAZIER STUFF
# Create a table of all features with their frequency of occurrence
  word_frequency_table <- data.frame(word=stm_40_x$vocab,count=stm_40_x$settings$dim$wcounts$x)
  # Sort word frequency table by frequency (printed for inspection only;
  # the sorted copy is not assigned)
  word_frequency_table[order(word_frequency_table$count),]
  save(word_frequency_table,file="wft.RData")

# Topics-by-words logical matrix flagging words with probability > .001
  sigword_pointer <- exp(stm_40_x$beta$logbeta[[1]])>.001

# Frequency-table rows for the "significant" words of topic 1
  cbind(word_frequency_table[sigword_pointer[1,],])

# Rounded word probabilities for topic 1's significant words.
# Fix: the original line carried a stray closing parenthesis (a parse error)
# and indexed the K-by-V matrix with a length-V logical vector, which
# recycles over the linear index; selecting row 1 explicitly yields topic 1.
  round(exp(stm_40_x$beta$logbeta[[1]]),3)[1,sigword_pointer[1,]]

# Number of "significant" words per topic
  rowSums(exp(stm_40_x$beta$logbeta[[1]])>.001)

### Find number of topics K (searchK) on a random ~10% subsample

# One Uniform(0,100) draw per document, used below to keep the >90 decile.
# Standard positional order runif(n, min, max), and the document count is
# taken from the dfm itself instead of the hard-coded constant 187573.
  dx1000@docvars$rand <- runif(ndoc(dx1000), 0, 100)

  dx1000_r10 <- dfm_subset(dx1000,rand>90)

  ox1000_r10 <- convert(dx1000_r10, to = "stm")

# Grid search over K on the subsample, 50% of documents held out
  sk_10_1 <- searchK(ox1000_r10$documents,ox1000_r10$vocab,K=c(20,40,60,80,100),init.type="Spectral",cores=1,emtol=1e-4,heldout.seed=5834,proportion=.5)
  save(sk_10_1,file="sk_10_1.RData")

  sk_10_2 <- searchK(ox1000_r10$documents,ox1000_r10$vocab,K=c(120,150,200,250,300),init.type="Spectral",cores=1,emtol=1e-4,heldout.seed=5834,proportion=.5)
  save(sk_10_2,file="sk_10_2.RData")

  sk_10_3 <- searchK(ox1000_r10$documents,ox1000_r10$vocab,K=c(190,210,220),init.type="Spectral",cores=1,emtol=1e-4,heldout.seed=5834,proportion=.5)
  save(sk_10_3,file="sk_10_3.RData")

  sk_10_4 <- searchK(ox1000_r10$documents,ox1000_r10$vocab,K=c(230,240),init.type="Spectral",cores=1,emtol=1e-4,heldout.seed=5834,proportion=.5)
  save(sk_10_4,file="sk_10_4.RData")

# Combine the four runs' diagnostics into one table
  K <- unlist(c(sk_10_1$results$K,sk_10_2$results$K,sk_10_3$results$K,sk_10_4$results$K))
  exclus <- unlist(c(sk_10_1$results$exclus,sk_10_2$results$exclus,sk_10_3$results$exclus,sk_10_4$results$exclus))
  semcoh <- unlist(c(sk_10_1$results$semcoh,sk_10_2$results$semcoh,sk_10_3$results$semcoh,sk_10_4$results$semcoh))
  heldout <- unlist(c(sk_10_1$results$heldout,sk_10_2$results$heldout,sk_10_3$results$heldout,sk_10_4$results$heldout))
  residual <- unlist(c(sk_10_1$results$residual,sk_10_2$results$residual,sk_10_3$results$residual,sk_10_4$results$residual))
  bound <- unlist(c(sk_10_1$results$bound,sk_10_2$results$bound,sk_10_3$results$bound,sk_10_4$results$bound))
  lbound <- unlist(c(sk_10_1$results$lbound,sk_10_2$results$lbound,sk_10_3$results$lbound,sk_10_4$results$lbound))
  em.its <- unlist(c(sk_10_1$results$em.its,sk_10_2$results$em.its,sk_10_3$results$em.its,sk_10_4$results$em.its))

  sk10 <- list(results=data.frame(K=K,exclus=exclus,semcoh=semcoh,heldout=heldout,residual=residual,bound=bound,lbound=lbound,em.its=em.its))

# Long format for faceted plotting of each diagnostic against K
  sk10t <- melt(sk10,id.vars=c("K"))

  ggplot(sk10t,aes(x=K,y=value))+geom_point()+geom_line()+facet_wrap(.~variable,scales="free_y")

# Run several alternative models with varying numbers of topics to compare regarding several criteria. 

  sk_20_1 <- searchK(ox1000$documents,ox1000$vocab,K=c(20,40,60,80),init.type="Spectral",cores=1,emtol=1e-4,heldout.seed=5834)
  save(sk_20_1,file="sk_20_1.RData")

  sk_20_2 <- searchK(ox1000$documents,ox1000$vocab,K=c(100),init.type="Spectral",cores=1,emtol=1e-4,heldout.seed=5834)
  save(sk_20_2,file="sk_20_2.RData")

  sk_20_3 <- searchK(ox1000$documents,ox1000$vocab,K=c(120),init.type="Spectral",cores=1,emtol=1e-4,heldout.seed=5834)
  save(sk_20_3,file="sk_20_3.RData")

  sk_20_4 <- searchK(ox1000$documents,ox1000$vocab,K=c(140),init.type="Spectral",cores=1,emtol=1e-4,heldout.seed=5834)
  save(sk_20_4,file="sk_20_4.RData")

  sk_20_5 <- searchK(ox1000$documents,ox1000$vocab,K=c(160),init.type="Spectral",cores=1,emtol=1e-4,heldout.seed=5834)
  save(sk_20_5,file="sk_20_5.RData")

  sk_20_6 <- searchK(ox1000$documents,ox1000$vocab,K=c(180),init.type="Spectral",cores=1,emtol=1e-4,heldout.seed=5834)
  save(sk_20_6,file="sk_20_6.RData")

  sk_20_7 <- searchK(ox1000$documents,ox1000$vocab,K=c(200),init.type="Spectral",cores=1,emtol=1e-4,heldout.seed=5834)
  save(sk_20_7,file="sk_20_7.RData")

  sk_20_9 <- searchK(ox1000$documents,ox1000$vocab,K=c(250),init.type="Spectral",cores=1,emtol=1e-4,heldout.seed=5834)
  save(sk_20_9,file="sk_20_9.RData")

  sk_20_A <- searchK(ox1000$documents,ox1000$vocab,K=c(300),init.type="Spectral",cores=1,emtol=1e-3,heldout.seed=5834)
  save(sk_20_A,file="sk_20_A.RData")

# Combine the per-run diagnostics into one table.
# Fix: the original referenced `sk_20`, which is never defined in this
# script (the first run is stored as `sk_20_1`), so these lines would fail
# with "object 'sk_20' not found". sk_20_A (K=300, coarser emtol) stays out
# of the combined table, as in the original.
  K <- unlist(c(sk_20_1$results$K,sk_20_2$results$K,sk_20_3$results$K,sk_20_4$results$K,sk_20_5$results$K,sk_20_6$results$K,sk_20_7$results$K,sk_20_9$results$K))
  exclus <- unlist(c(sk_20_1$results$exclus,sk_20_2$results$exclus,sk_20_3$results$exclus,sk_20_4$results$exclus,sk_20_5$results$exclus,sk_20_6$results$exclus,sk_20_7$results$exclus,sk_20_9$results$exclus))
  semcoh <- unlist(c(sk_20_1$results$semcoh,sk_20_2$results$semcoh,sk_20_3$results$semcoh,sk_20_4$results$semcoh,sk_20_5$results$semcoh,sk_20_6$results$semcoh,sk_20_7$results$semcoh,sk_20_9$results$semcoh))
  heldout <- unlist(c(sk_20_1$results$heldout,sk_20_2$results$heldout,sk_20_3$results$heldout,sk_20_4$results$heldout,sk_20_5$results$heldout,sk_20_6$results$heldout,sk_20_7$results$heldout,sk_20_9$results$heldout))
  residual <- unlist(c(sk_20_1$results$residual,sk_20_2$results$residual,sk_20_3$results$residual,sk_20_4$results$residual,sk_20_5$results$residual,sk_20_6$results$residual,sk_20_7$results$residual,sk_20_9$results$residual))
  bound <- unlist(c(sk_20_1$results$bound,sk_20_2$results$bound,sk_20_3$results$bound,sk_20_4$results$bound,sk_20_5$results$bound,sk_20_6$results$bound,sk_20_7$results$bound,sk_20_9$results$bound))
  lbound <- unlist(c(sk_20_1$results$lbound,sk_20_2$results$lbound,sk_20_3$results$lbound,sk_20_4$results$lbound,sk_20_5$results$lbound,sk_20_6$results$lbound,sk_20_7$results$lbound,sk_20_9$results$lbound))
  em.its <- unlist(c(sk_20_1$results$em.its,sk_20_2$results$em.its,sk_20_3$results$em.its,sk_20_4$results$em.its,sk_20_5$results$em.its,sk_20_6$results$em.its,sk_20_7$results$em.its,sk_20_9$results$em.its))

  sk20 <- list(results=data.frame(K=K,exclus=exclus,semcoh=semcoh,heldout=heldout,residual=residual,bound=bound,lbound=lbound,em.its=em.its))

  sk20_long <- melt(sk20,id.vars="K")

  ggplot(sk20_long,aes(x=K,y=value,shape=variable,color=variable))+geom_line()+geom_point(fill="white")+facet_wrap(.~variable,scales="free_y")

# Extended stopword list: single letters, OCR "wv" artefacts, the first 233
# vocabulary entries of the 40-topic model, and common high-frequency
# function-like words.
  sw2 <- c("r", "t", "e", "tho", "s", "o",stm_40_x$vocab[1:233],"bt", "br", "bp", "b", "bo", "becn", "ar", "ap", "anl", "ana", "bv", "c", "ca", "da", "de", "di", "e", "e.c", "ed", "ef", "el", "es", "et", "ex", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "wvar", "wvas", "wve", "wvere", "wvhat", "wvhen", "wvhich", "wvho", "wvill", "wvith", "wvould", "ah", "ai", "much", "mr", "can", "one", "make", "may", "per", "take", "last", "new", "now", "state", "two", "go", "also", "great", "first", "man", "far", "come", "see", "even", "many", "little")

# Numeric year covariate for the prevalence formula.
# NOTE(review): the documents are subset to non-missing YEAR below, but the
# prevalence formula uses the full-length ox1000$meta$YEAR — searchK expects
# the covariate rows to match the documents; confirm the lengths align.
  dx1000@docvars$YEAR <- as.numeric(dx1000@docvars$year)
  ox1000$meta$YEAR <- as.numeric(ox1000$meta$year)
# searchK at K = 50..250 with a smooth year effect on topic prevalence
  skd_050 <- searchK(ox1000$documents[!is.na(ox1000$meta$YEAR)],ox1000$vocab,K=c(50),init.type="Spectral",cores=1,emtol=1e-3,heldout.seed=5834,prevalence=~s(ox1000$meta$YEAR))
  save(skd_050,file="skd_050.RData")

  skd_100 <- searchK(ox1000$documents[!is.na(ox1000$meta$YEAR)],ox1000$vocab,K=c(100),init.type="Spectral",cores=1,emtol=1e-3,heldout.seed=5834,prevalence=~s(ox1000$meta$YEAR))
  save(skd_100,file="skd_100.RData")

  skd_150 <- searchK(ox1000$documents[!is.na(ox1000$meta$YEAR)],ox1000$vocab,K=c(150),init.type="Spectral",cores=1,emtol=1e-3,heldout.seed=5834,prevalence=~s(ox1000$meta$YEAR))
  save(skd_150,file="skd_150.RData")

  skd_200 <- searchK(ox1000$documents[!is.na(ox1000$meta$YEAR)],ox1000$vocab,K=c(200),init.type="Spectral",cores=1,emtol=1e-3,heldout.seed=5834,prevalence=~s(ox1000$meta$YEAR))
  save(skd_200,file="skd_200.RData")

  skd_250 <- searchK(ox1000$documents[!is.na(ox1000$meta$YEAR)],ox1000$vocab,K=c(250),init.type="Spectral",cores=1,emtol=1e-3,heldout.seed=5834,prevalence=~s(ox1000$meta$YEAR))
  save(skd_250,file="skd_250.RData")

##############
##############
### The STM with 250 topics 

# Drop documents whose "year" metadata is missing
  oxy <- ox1000$documents[!is.na(ox1000$meta$year)]

# Fix: stm() requires one metadata row per document. The documents are
# subset above, but the original passed the full ox1000$meta (183332 rows
# vs 183239 documents — see the counts below), so the covariate data must
# be subset the same way.
  oxm <- ox1000$meta[!is.na(ox1000$meta$year),]

  stm250 <- stm(documents=oxy,vocab=ox1000$vocab,K=250,data=oxm,seed=5834,emtol=1e-5,prevalence=~year,init.type="Spectral")
  save(stm250,file="stm250.RData")

  # STM is length 183239
  # ox1000 is length 183332
  # The difference stems from missing "year" metadata.

  # Per-topic distribution of rounded document proportions, for inspection
  topic_probs <- list()

  for (i in 1:250)
    {
      topic_probs[[i]] <- table(round(stm250$theta[,i],1))
    }

# Wide document-level table: the year-subset metadata (matching the rows of
# theta) followed by the 250 topic proportions (columns X1..X250)
  topics_doc <- data.frame(ox1000$meta[!is.na(ox1000$meta$year),],stm250$theta)

  varnames_doc <- names(topics_doc)

# Long format: one row per document-topic pair; columns 1:17 are metadata,
# 18:267 are the 250 topic proportion columns
  topics_top <- melt(topics_doc,id.vars=varnames_doc[1:17],measure.vars=varnames_doc[18:267])

# Assemble a date per document.
# NOTE(review): assumes metadata columns named `year`, `Month` (capital M)
# and `day` — confirm the casing against the actual docvars.
  topics_top$DATO <- paste0(topics_top$year,"-",topics_top$Month,"-",topics_top$day)
  topics_top$dato <- as.Date(topics_top$DATO,format="%Y-%m-%d")

# Days elapsed since 1784-12-31 (the corpus time origin).
# NOTE(review): `unit=` relies on partial matching of difftime's `units=`
# argument — works, but spelled-out `units="days"` would be safer.
  topics_top$jd <- as.POSIXct(topics_top$dato,format="%Y-%m-%d")
  topics_top$j0 <- as.POSIXct("1784-12-31",format="%Y-%m-%d")
  topics_top$days <- difftime(topics_top$jd,topics_top$j0,unit="days")

# Daily sum of topic mass per topic; drop=FALSE keeps empty day-topic cells
  topics_ts <- aggregate(topics_top$value,by=list(topics_top$days,topics_top$variable),FUN="sum",drop=FALSE)

  names(topics_ts) <- c("day", "topic", "count")

# Daily total topic mass across all topics
  total.count <- aggregate(topics_top$value,by=list(topics_top$days),FUN="sum",drop=FALSE)

# NOTE(review): this recycles the per-day totals across the day-by-topic
# rows; it relies on aggregate() ordering rows with day varying fastest
# within topic — confirm before trusting the shares.
  topics_ts$total.count <- total.count$x
  topics_ts$share <- topics_ts$count/topics_ts$total.count
  topics_ts$days_num <- as.numeric(topics_ts$day)

#  save(sk10t,file="sk10t.RData")
#  save(topics_ts,file="topics_ts.RData")
#  save(topics_top,file="topics_top.RData")
#  save(topics_doc,file="topics_doc.RData")

# Label table: one row per topic (X1..X250) with the seven score words as
# columns i1..i7 and a "w1/w2/.../w7" composite label.
  topic.labels <- labelTopics(stm250)
  to.lab <- data.frame(topic=paste0("X",topic.labels$topicnums))
  to.lab[,c("i1", "i2", "i3", "i4", "i5", "i6", "i7")] <- topic.labels$score

# Build the composite label by deparsing each topic's word vector and
# stripping the c("...") decoration. (An earlier within()/paste0()
# construction of the same column was dead code — it was immediately
# overwritten by these lines — and has been removed.)
  to.lab$topic250 <- paste(data.frame(t(topic.labels$score)))
  to.lab$topic250 <- stringr::str_remove_all(to.lab$topic250,pattern='[c\\(\\",\\)]')
  to.lab$topic250 <- stringr::str_replace_all(to.lab$topic250,pattern='[[:blank:]]','/')

# Manually-assigned mid-level category label for each of the 250 topics, in
# topic order X1..X250 (ten labels per line). "(invalid)" marks junk topics
# that are excluded from the share denominators computed further below.
  to.lab$topic50 <- c(  
            "GEO/GEOPOLITICS", "POL/REBELLION", "ECO/BUSINESS", "PUB/NEWS", "POL/PARLIAMENT", "(invalid)", "(invalid)", "EPI/CHOLERA", "ECO/STOCKEXCHANGE", "GEO/ITALY",
                        "EPI/INFLUENZA", "LEI/FESTIVAL", "ECO/RAW/FOSSIL", "SCI/EVIDENCE", "ECO/BUSINESS", "DIS/SHIP", "PUB/PROBLEM", "EDU/SCHOOL", "GEO/ISRAEL", "(invalid)",
                        "POL/CABINET", "DIS/CONSTRUCTIONS", "(invalid)", "EPI/INFLUENZA", "HEA/HEALTHCARE", "SCI/SPACE", "DIS/COLLAPSE", "ECO/FINANCE", "LEI/TVSHOWS", "(invalid)",
                        "TRA/AIRTRAVEL", "ECO/RAW/FOSSIL", "WEL/PENSIONS", "TRA/OCEANTRAVEL", "GEO/NEGOTIATION", "(invalid)", "GEO/KOREA", "ECO/RAW/METAL", "(invalid)", "ECO/RAW/AGRICULTURE",
                        "TRA/ROADTRAVEL", "GEO/BREXIT", "DIS/MINE", "(invalid)", "ECO/BUSINESS", "DIS/FIRE", "GEO/RUSSIA", "ECO/RAW/ALCOHOL", "WEL/CARE", "GEO/ASIA&LATIN",
                        "EDU/CAREER&HR", "DOM/POLICE", "ECO/INDUSTRY/STEEL", "OTH/HERITAGE", "HEA/CANCER&ILL", "DOM/IMMIGRATION", "(invalid)", "EDU/UNIVERSITY", "GEO/AUSTRALIA&NEWZEALAND", "ECO/REGULATION",
                        "(invalid)", "POL/ELECTION", "EPI/COVID", "HEA/HEALTHCARE", "GEO/AFGHANISTAN&PAKISTAN&MUSLIM", "DOM/INQUIRY", "TRA/RAILTRAVEL", "(invalid)", "DIS/SHIPWRECK", "GEO/GERMANY",
                        "ECO/FINANCE", "(invalid)", "DOM/CRIME", "POL/COMMITTEE", "(invalid)", "GEO/USA", "(invalid)", "REL/CHURCH", "TRA/CARGO", "MIL/NAVY",
                        "ECO/CURRENCY", "GEO/ARABIA", "ECO/FINANCE", "GEO/ENGLAND", "GEO/EU", "LEI/FOOTBALL", "GEO/CANADA&NIGERIA", "DOM/JUSTICE", "EPI/FEVER", "ECO/NATIONAL",
                        "DIS/EMERGENCY", "POL/CONSTITUTION", "LEI/ART&EXHIBITION", "ECO/BUSINESS", "LEI/CHRISTMAS&TOYS", "GEO/IRELAND", "ECO/FINANCE", "ECO/RAW/AGRICULTURE", "INF/CONSTRUCTION", "(invalid)", # "POL/CONSTITUTION" also includes "GEO/HUNGARY"; "ECO/BUSINESS" includes "ECO/STOCKEXCHANGE"
                        "GEO/CHINA", "POL/STATEMENT", "LAB/STRIKE", "DOM/CRIME", "NRG/NUCLEAR", "DOM/TERRORISM", "GEO/COLONIES&EMPIRE", "DIS/STORM", "LEI/THEATER", "PUB/NEWS",
                        "ECO/FINANCE", "GEO/INDIA", "REL/CHURCH", "SCI/VACCINE", "GEO/FRANCE", "WEL/POVERTY", "ECO/INDUSTRY/PHARMA", "GEO/BRITAIN", "LEI/PETS", "GEO/TURKEY&BULGARIA",
                        "GEO/ALLIANCE", "ENV/PLASTIC", "(invalid)", "TRA/AIRTRAVEL", "ECO/RAW/AGRICULTURE", "GEO/IRAQ&HUMANITARIAN", "TRA/ROADTRAVEL", "(invalid)", "GEO/EU", "GEO/AUSTRIA",
                        "GEO/PORTUGAL", "POL/ROYAL", "ECO/BUSINESS", "(invalid)", "ECO/NATIONAL", "INF/CONSTRUCTION", "ECO/RAW/AGRICULTURE", "GEO/REGIONS", "WEL/HOUSING", "LEI/MOVIES&POP",
                        "ECO/INDUSTRY/TEXTILE", "WEL/PEASANTRY", "FAM/MARRIAGE", "(invalid)", "POL/SYSTEM", "PUB/ASSOCIATIONS", "GEO/USA&UK&CUBA", "GEO/SOVIET", "WEL/HOUSING", "LEI/OLYMPICS",
                        "ECO/RAW/AGRICULTURE", "DIS/EARTHQUAKE", "DOM/CRIME", "GEO/ARGENTINA", "ECO/BUSINESS", "(invalid)", "GEO/SYRIA", "GEO/BRITAIN", "(invalid)", "ECO/CURRENCY",
                        "REL/RELIGION", "POL/PARTYPOLITICS", "GEO/RUSSIA", "LAB/UNEMPLOYMENT", "ECO/FINANCE", "MIL/ARMY", "WEL/CHARITY&AID", "ECO/RAW/FISHERY&COAST", "LEI/LITERATURE", "ECO/BUSINESS",
                        "LEI/CRICKET", "EDU/UNIVERSITY", "HEA/HEALTHYLIFESTYLE", "INF/TELECOM", "LEI/MOVIES", "(invalid)", "POL/ROYAL", "ECO/BUSINESS", "POL/PM", "ENV/WOOD&TREES",
                        "TRA/RAILTRAVEL", "ECO/SERVICE/RETAIL", "POL/PM", "(invalid)", "GEO/BRITAIN", "ECO/FINANCE", "GEO/BRITAIN", "ECO/RAW/FOSSIL", "ECO/REGULATION", "ECO/NATIONAL",
                        "POL/USPRESIDENT", "LEI/MOVIES", "POL/PM", "GEO/SCOTLAND", "POL/PARLIAMENT", "GEO/SOUTHAFRICA", "ECO/RAW/METAL", "LEI/TOURISM&RESTAURANTS", "PUB/EVENTS", "SCI/IT",
                        "PUB/OBITUARY", "TRA/OCEANTRAVEL", "GEO/USA", "MIL/ENEMY&BATTLE", "DIS/FLOOD", "DOM/CRIME", "WEL/POVERTY", "INF/CONSTRUCTION", "GEO/IBERIA", "ENV/CLIMATE",
                        "POL/POLICYMAKING", "EDU/SCHOOL", "GEO/MIDDLEEAST", "GEO/GREECE&TURKEY", "ECO/INDUSTRY/CAR", "LAB/WAGES", "GEO/IRAN", "(invalid)", "GEO/EGYPT&SUEZ", "ECO/RAW/TOBACCO",
                        "DIS/EMERGENCY", "PUB/NEWS", "ENV/POLLUTION", "HEA/DRUG&MEDICINE", "GEO/SCANDINAVIA", "LEI/FASHION", "POL/PM", "WEL/FAMINE&POVERTY", "GEO/SOUTHAMERICA", "POL/PROTEST",
                        "DIS/CATASTROPHE", "(invalid)", "ECO/STOCKEXCHANGE", "POL/PARLIAMENT", "GEO/SOUTHEASTASIA", "ECO/NATIONAL", "DIS/OILRIG", "(invalid)", "GEO/SWITZERLAND", "FAM/CHILDREN",
                        "ENV/CLIMATE", "PUB/OBITUARY", "PUB/BBC", "POL/INVESTIGATION", "ECO/BUSINESS", "TRA/AIRTRAVEL", "POL/STATEMENT", "GEO/VIETNAM", "PUB/PUBLICATION", "OTH/RISK")
                        
# Top-level label = first three alphanumeric characters of the topic50 label
# (e.g. "ECO", "GEO"). "(invalid)" starts with "(" and so yields NA here.
  to.lab$topic20 <- str_extract(to.lab$topic50,"^[:alnum:]{3,3}")

# Attach the three label granularities to the long document-topic table
  topics_top$to250 <- to.lab[match(topics_top$variable,to.lab$topic),"topic250"]
  topics_top$to50 <- to.lab[match(topics_top$variable,to.lab$topic),"topic50"]
  topics_top$to20 <- to.lab[match(topics_top$variable,to.lab$topic),"topic20"]


# Elapsed months/years since the 1784-12-31 origin.
# NOTE(review): mondf()/yeardf() are project helpers defined elsewhere —
# presumably month/year differences between two dates; confirm their
# semantics before relying on month_num/year_num.
  topics_top$months <- mondf(topics_top$j0,topics_top$jd)
  topics_top$years <- yeardf(topics_top$j0,topics_top$jd)
  topics_top$day_num <- round(as.numeric(topics_top$days),0)

  topics_top$month_num <- round(as.numeric(topics_top$months),0)
  topics_top$year_num <- round(as.numeric(topics_top$years),0)


# Same labels for the aggregated day-level time series
  topics_ts$to250 <- to.lab[match(topics_ts$topic,to.lab$topic),"topic250"]
  topics_ts$to50 <- to.lab[match(topics_ts$topic,to.lab$topic),"topic50"]
  topics_ts$to20 <- to.lab[match(topics_ts$topic,to.lab$topic),"topic20"]

# Create daily topic time series

  to250_day <- aggregate(topics_top$value,by=list(topics_top$day_num,topics_top$to250),FUN="sum",drop=FALSE)
  to50_day <- aggregate(topics_top$value,by=list(topics_top$day_num,topics_top$to50),FUN="sum",drop=FALSE)
  to20_day <- aggregate(topics_top$value,by=list(topics_top$day_num,topics_top$to20),FUN="sum",drop=FALSE)

  names(to250_day) <- c("day", "topic", "count")
  names(to50_day) <- c("day", "topic", "count")
  names(to20_day) <- c("day", "topic", "count")

# Create monthly topic time series

  to250_month <- aggregate(topics_top$value,by=list(topics_top$month_num,topics_top$to250),FUN="sum",drop=FALSE)
  to50_month <- aggregate(topics_top$value,by=list(topics_top$month_num,topics_top$to50),FUN="sum",drop=FALSE)
  to20_month <- aggregate(topics_top$value,by=list(topics_top$month_num,topics_top$to20),FUN="sum",drop=FALSE)

# Fix: the original renamed the *_year objects at this point — before they
# were created below — and never renamed the monthly objects, so the later
# to*_month$count / to*_month$total.count computations had no `count`
# column. Rename the monthly objects here and the yearly ones after they
# are built.
  names(to250_month) <- c("month", "topic", "count")
  names(to50_month) <- c("month", "topic", "count")
  names(to20_month) <- c("month", "topic", "count")

# Create yearly topic time series

  to250_year <- aggregate(topics_top$value,by=list(topics_top$year_num,topics_top$to250),FUN="sum",drop=FALSE)
  to50_year <- aggregate(topics_top$value,by=list(topics_top$year_num,topics_top$to50),FUN="sum",drop=FALSE)
  to20_year <- aggregate(topics_top$value,by=list(topics_top$year_num,topics_top$to20),FUN="sum",drop=FALSE)

  names(to250_year) <- c("year", "topic", "count")
  names(to50_year) <- c("year", "topic", "count")
  names(to20_year) <- c("year", "topic", "count")


# Daily article count (valid topics only)

  total.count.day <- aggregate(subset(topics_top,to50!="(invalid)")$value,by=list(subset(topics_top,to50!="(invalid)")$days),FUN="sum",drop=FALSE)

# Monthly article count (valid topics only)

  total.count.month <- aggregate(subset(topics_top,to50!="(invalid)")$value,by=list(subset(topics_top,to50!="(invalid)")$month_num),FUN="sum",drop=FALSE)

# Yearly article count (valid topics only)

  total.count.year <- aggregate(subset(topics_top,to50!="(invalid)")$value,by=list(subset(topics_top,to50!="(invalid)")$year_num),FUN="sum",drop=FALSE)

# Put daily, monthly, yearly count into topic-specific time series.
# NOTE(review): each total vector (one value per period) is recycled across
# the period-by-topic rows; this relies on aggregate() ordering rows with
# the period varying fastest within topic and on every period being present
# for every topic (drop=FALSE) — confirm before trusting the shares. Note
# also that the denominators exclude "(invalid)" topics while the per-topic
# counts still include them.

  to250_day$total.count <- total.count.day$x
  to50_day$total.count <- total.count.day$x
  to20_day$total.count <- total.count.day$x
  to250_month$total.count <- total.count.month$x
  to50_month$total.count <- total.count.month$x
  to20_month$total.count <- total.count.month$x
  to250_year$total.count <- total.count.year$x
  to50_year$total.count <- total.count.year$x
  to20_year$total.count <- total.count.year$x

# Compute daily shares of topics

  to250_day$share <- to250_day$count/to250_day$total.count
  to50_day$share <- to50_day$count/to50_day$total.count
  to20_day$share <- to20_day$count/to20_day$total.count
  to250_month$share <- to250_month$count/to250_month$total.count
  to50_month$share <- to50_month$count/to50_month$total.count
  to20_month$share <- to20_month$count/to20_month$total.count
  to250_year$share <- to250_year$count/to250_year$total.count
  to50_year$share <- to50_year$count/to50_year$total.count
  to20_year$share <- to20_year$count/to20_year$total.count

# Convert the year index (years since the 1784 origin) to calendar years
  to20_year$dyear <- (to20_year$year+1784)
  to50_year$dyear <- (to50_year$year+1784)

# Plot yearly shares of highest-order topics

  ggplot(to20_year,aes(x=dyear,y=share))+geom_smooth()+geom_point(size=0.25)+facet_wrap(.~topic,scales="free_y")

  # save(to20_year,file="to20_year.RData")
  # save(to50_year,file="to50_year.RData")
  # save(to250_year,file="to250_year.RData")
  # save(to20_month,file="to20_month.RData")
  # save(to50_month,file="to50_month.RData")
  # save(to250_month,file="to250_month.RData")
  # save(to20_day,file="to20_day.RData")
  # save(to50_day,file="to50_day.RData")
  # save(to250_day,file="to250_day.RData")

4.1.2 Noncrisis Corpus

Code
#### NC STM

library(quanteda)
library(stm)
library(tidyverse)
library(stringr)
library(babynames)
library(lexicon)
library(wru)
library(pbapply)

# Fix: use a relative, forward-slash path. The original hard-coded an
# absolute Windows/OneDrive path specific to one author's machine, which is
# neither portable nor consistent with the anonymised replication setup.
setwd("./crisis_non/")

### Converting the [quanteda] DFM into [stm]'s DFM format

# Manual stopword list for the noncrisis corpus: numerals, ordinals,
# single/double-character OCR fragments, mis-encoded characters, and the
# same clipped-word list used for the crisis corpus. (Some entries repeat;
# harmless for tokens_remove().)
sw <- c(
  "__", "___", "0", "00", "000", "01", "02", "03", "04", "05", "07", "0o", "0t", "1", "1,000",
  "1,500", "1.1", "10", "10,000", "100", "100,000", "1001", "101", "102", "103",
  "104", "105", "106", "107", "108", "109", "10s", "10th", "11", "110",
  "111", "112", "113", "114", "115", "116", "117", "118", "119", "11th",
  "12", "120", "121", "122", "123", "124", "125", "127", "12s", "12th",
  "13", "130", "131", "132", "133", "134", "135", "13s", "13th", "14",
  "140", "141", "145", "14s", "14th", "15", "150", "151", "155", "15s", "15th",
  "16", "160", "161", "16s", "16th", "17", "170", "171", "17s", "17th", "18", "180", "181", "1870",
  "18s", "18th", "19", "190", "191",
  "1910", "19th", "1a", "1b", "1c",
  "1d", "1e", "1f", "1h", "1i",
  "1l", "1m", "1n", "1o", "1r",
  "1s", "1st", "1t", "1th", "1v",
  "1w", "1y", "2", "2,000", "2.30",
  "20", "20,000", "200", "201", "20s",
  "20th", "21", "210", "211", "212",
  "21s", "21st", "22", "220", "22d",
  "23", "230", "23d", "24", "24s",
  "24th", "25", "250", "25s", "25th",
  "26", "26th", "27", "27th", "28",
  "28th", "29", "29th", "2a", "2d",
  "2i", "2l", "2nd", "2o", "2r",
  "2s", "2t", "3", "3,000", "30",
  "300", "30s", "30th", "31", "31r",
  "31st", "32", "33", "34", "35",
  "350", "36", "37", "38", "39",
  "3a", "3d", "3i", "3ir", "3l",
  "3mr", "3o", "3r", "3rd", "3s",
  "3t", "4", "4,000", "40", "400",
  "40s", "41", "42", "43", "44",
  "45", "46", "47", "48", "49",
  "4a", "4d", "4i", "4l", "4o",
  "4s", "4t", "4th", "5", "5,000",
  "50", "500", "51", "52", "53",
  "54", "55", "56", "57", "58",
  "59", "5d", "5i", "5s", "5t",
  "5th", "6", "60", "600", "61",
  "62", "63", "64", "65", "66",
  "67", "68", "69", "6d", "6f",
  "6i", "6s", "6t", "6th", "7",
  "7.30", "70", "700", "71", "72",
  "73", "74", "75", "76", "77",
  "78", "79", "7d", "7s", "7th",
  "8", "80", "800", "81", "82",
  "83", "84", "85", "86", "87",
  "88", "89", "8d", "8s", "8th",
  "9", "90", "91", "92", "93",
  "94", "95", "96", "97", "98",
  "99", "9d", "9s", "9th", "a",
  "â", "ã", "a.d", "a.m", "a1",
  "a2", "a3", "a4", "a5", "aa",
  "aâ", "aad", "ab", "oclock", "minut", "yard", "morn", "half-past", "dont", "don't", "say", "did", "didn't", "get", "want", "just", "think", "cant", "cannot", "theyr", "wasnt", "wouldnt", "wasn't", "wouldn't", "havent", "haven't", "shouldn't", "ing", "tion", "con", "com", "pro", "ã‚â£", "ã‚â£m", "â£m", "•", "â–º", "“the", "britain’", "¬", "â", "â€", "~", "ã", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "B", "C", "D", "E", "F", "G", "H", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "ll", "^", "th", "ot", "15", "20", "te", ">", "id", "il", "´", "`", "la", "ii", "en", "al", "tbe", "re", "lt", "li", "ar", "fo", "thie", "°", "aud", "le", "es", "ss", "od", "bo", "ir", "tho"
)

# Manual list plus quanteda's built-in English stopwords
swen <- c(sw, stopwords("english"))

# Deduplicate noncrisis articles by id, build a corpus, tokenize without
# punctuation.
nc.text <- nc.r[!duplicated(nc.r$id), ]
nc.corpus <- corpus(nc.text)
nc.tok <- tokens(nc.corpus, remove_punct = TRUE)

# Stopword-filtered noncrisis tokens, tagged with their corpus of origin
nc.tok.sw <- tokens_remove(nc.tok, pattern = swen)
docvars(nc.tok.sw)$Corpus <- "Noncrisis"

# Same treatment for the crisis-corpus tokens
cc.tok.sw <- tokens_remove(cc.tokens, pattern = swen)
docvars(cc.tok.sw)$Corpus <- "Crisis"

# Disambiguate document names that occur in both corpora before combining
clash <- names(cc.tok.sw) %in% names(nc.tok.sw)
names(cc.tok.sw)[clash] <- paste0(names(cc.tok.sw)[clash], "_A001")

tt.tok.sw <- c(cc.tok.sw, nc.tok.sw)

# DFM of the noncrisis tokens with English stopwords and the manual list
# removed, tagged with its corpus of origin
nc.dtm <- dfm(tokens_remove(nc.tok, pattern = c(stopwords("en"), sw)))
docvars(nc.dtm)$Corpus <- "Noncrisis"

# Trim to words occurring in >= 1000 documents, drop stopwords again, keep
# only documents with a known year, and convert to stm's input format
nc.trim <- dfm_trim(nc.dtm, min_docfreq = 1000, docfreq_type = "count")
nc.trim <- dfm_remove(nc.trim, pattern = sw)
nc.trim <- dfm_subset(nc.trim, !is.na(year))
nc.out <- convert(nc.trim, to = "stm")

# English reference vocabulary: a word-list file plus lowercase first names
# (babynames) and surnames (presumably wru's surnames2010 — confirm)
endict <- read.csv(file = "ENGVOC_lo.txt")
endict <- as.character(endict[, 1]) # ensure that the vocabulary list is represented as character vector.
person.names <- as.matrix(babynames[, 3])
person_names <- tolower(unique(person.names))
surnames <- tolower(surnames2010[, 1])

wordlist <- c(endict, person_names, surnames)

# Does token `d` occur in the English reference `wordlist`?
#
# @param d A single token (character scalar) from the model vocabulary.
# @return TRUE if `d` occurs as a literal substring of at least one wordlist
#   entry, FALSE otherwise.
#
# Fixes relative to the original: `d` was passed as an unescaped regular
# expression, so tokens containing regex metacharacters (".", "(", "+", ...)
# could error or match spuriously — `fixed = TRUE` matches the token
# literally. any() also short-circuits on the first hit instead of summing
# the whole logical vector.
anyMatch <- function(d) {
  any(grepl(pattern = d, x = wordlist, fixed = TRUE))
}

# Flag each vocabulary entry as English-like (progress bar via pbapply)
words <- pblapply(X = nc.out$vocab, FUN = anyMatch)
keep <- as.logical(words)
en_words <- nc.out$vocab[keep]
en_nonwords <- nc.out$vocab[!keep]

# Rebuild the DFM with the non-English vocabulary removed as well
nc.dtm <- dfm(tokens_remove(nc.tok, pattern = c(en_nonwords, stopwords("en"), sw)))
docvars(nc.dtm)$Corpus <- "Noncrisis"

# Same trim/subset/convert pipeline as before, on the cleaned DFM
nc.trim2 <- dfm_trim(nc.dtm, min_docfreq = 1000, docfreq_type = "count")
nc.trim2 <- dfm_remove(nc.trim2, pattern = sw)
nc.trim2 <- dfm_subset(nc.trim2, !is.na(year))
nc.out2 <- convert(nc.trim2, to = "stm")

### Conduct a stm with 250 topics with time as covariate

nc_STM <- stm(documents = nc.out2$documents, vocab = nc.out2$vocab, K = 250, data = nc.out2$meta, init.type = "Spectral", verbose = TRUE, prevalence = ~year)

# save(nc_STM,file="nc_STM.RData")

### Create a wide and a long version of the data

# Fix: the theta rows correspond to the documents the model was fitted on
# (nc.out2); the original bound nc.out$meta, a differently-trimmed object,
# risking misaligned metadata.
nc_STM_W <- data.frame(nc_STM$theta, nc.out2$meta)

nc_STM_L <- pivot_longer(nc_STM_W, paste0("X", 1:250))

### Inspect STM solution

# Topic column names X1..X250
nto <- paste0("X", 1:250)

# Seven highest-probability words per topic, collapsed into a single
# "w1 / w2 / ... / w7" string per topic
lto <- labelTopics(nc_STM)
lto7 <- tibble(data.frame(lto[1]))
to7 <- within(lto7, top7 <- paste(topics.1, "/", topics.2, "/", topics.3, "/", topics.4, "/", topics.5, "/", topics.6, "/", topics.7))

# Manually-assigned category label for each of the noncrisis model's 250
# topics, in topic order (ten per line; inline markers flag topics 120/150).
# "NA/..." marks junk topics (time words, pronouns, OCR noise, etc.).
# NOTE(review): topic.cat is defined here but not visibly used later in
# this chunk — confirm it is consumed elsewhere.
topic.cat <- c(
  "NA/time", "MIL/ENEMY&BATTLE", "GEO/IRELAND", "POL/ELECTION", "NA/NA", "NA/NA", "NA/NA", "NA/NA", "PUB/NEWS", "NA/color",
  "GEO/AUSTRALIA&NEWZEALAND", "NA/time", "PUB/PUBLICATION", "GEO/SCANDINAVIA", "NA/mix", "NA/NA", "ECO/RAW/ALCOHOL", "ECO/BUSINESS", "NA/NA", "ECO/MARKET",
  "ECO/INDUSTRY/TEXTILE", "NA/NA", "NA/NA", "NA/NA", "ECO/TAX", "ECO/RAW/FOSSIL", "NA/time", "NA/NA", "NA/directions", "NA/NA",
  "ECO/BUSINESS", "NA/NA", "NA/say", "NA/NA", "PUB/NEWS", "GEO/SOUTHAMERICA", "NA/time", "INF/COMMUNICATION", "PUB/OBITUARY", "NA/time",
  "PUB/EVENT", "NA/NA", "NA/positive", "ECO/RAW", "GEO/GERMANY", "GEO", "REL", "REL", "DOM/JUSTICE", "NA/NA",
  "NA/NA", "GEO/ENGLAND", "NA/time", "LOC/ENGLAND", "ECO/BUSINESS", "NA/NA", "NA/NA", "NA/NA", "NA/NA", "NA/time",
  "TRA/OCEANTRAVEL", "NA/time", "MIL/ARMY", "ECO/STOCKEXCHANGE", "NA/family", "NA/NA", "ECO/MARKET", "NA/NA", "ECO/BUSINESS", "NA/NA",
  "DOM/ADMINISTRATION", "GEO/ENGLAND", "MIL/NAVY", "NA/time", "PUB/OBITUARY", "ECO/BUSINESS", "DOM/JUSTICE", "ECO/MARKET", "DOM/JUSTICE", "ECO/REALESTATE",
  "NA/pronouns", "ECO/MARKET", "ECO/RAW", "ECO/INDUSTRY/CAR", "ECO/REALESTATE", "NA/time", "EDU/SCHOOL", "ECO/RAW/AGRICULTURE", "ECO/RAW/AGRICULTURE", "NA/NA",
  "GEO/ENGLAND", "NA/NA", "ECO/INDUSTRY/CAR", "ECO/RAW/AGRICULTURE", "NA/NA", "NA/NA", "NA/directions", "FAM", "TRA/RAILTRAVEL", "POL/USPRESIDENT",
  "ECO/FINANCE", "NA/NA", "ECO/BUSINESS", "NA/names", "NA/NA", "ECO/BUSINESS", "WEL/CHARITY&AID", "NA/NA", "ECO/RAW/AGRICULTURE", "NA/time",
  "NA/NA", "ECO/FINANCE", "PUB/NEWS", "NA/NA", "GEO/BRITAIN", "NA/NA", "TRA/OCEANTRAVEL", "NA/NA", "ECO/RAW/FOSSIL", "NA/time", # 120
  "ECO/REALESTATE", "NA/NA", "NA/NA", "DOM/JUSTICE", "LEI/ART&EXHIBITION", "POL/PARLIAMENT", "GEO/USA", "ECO/RAW/FOSSIL", "NA/time", "ECO/MARKET",
  "ECO/NATIONAL", "POL/ELECTION", "ECO/RAW/AGRICULTURE", "ECO/RAW/AGRICULTURE", "ECO/STOCKEXCHANGE", "NA/names", "MIL/ENEMY&BATTLE", "NA/location", "ECO/MARKET", "ECO/INDUSTRY/STEEL",
  "ECO/REGULATION", "HEA/HEALTHCARE", "ECO/RAW/AGRICULTURE", "NA/NA", "NA/time", "TRA/OCEANTRAVEL", "NA/NA", "NA/NA", "NA/NA", "LAB/UNEMPLOYMENT", # 150
  "ECO/RAW/FOSSIL", "NA/NA", "GEO/BRITAIN", "ECO/BUSINESS", "LEI/THEATER", "LAB/UNEMPLOYMENT", "INF/COMMUNICATION", "ECO/MARKET", "NA/NA", "ECO/STOCKEXCHANGE",
  "NA/NA", "NA/time", "ECO/RAW/AGRICULTURE", "MIL/ARMY", "NA/NA", "NA/NA", "NA/NA", "NA/NA", "NA/NA", "NA/NA",
  "NA/NA", "NA/time", "NA/NA", "NA/NA", "NA/titles", "NA/time", "ECO/BUSINESS", "ECO/FINANCE", "ECO/BUSINESS", "NA/NA",
  "ECO/FINANCE", "MIL/ARMY", "NA/NA", "NA/title", "NA/NA", "TRA/RAILTRAVEL", "LEI/FESTIVAL", "ECO/BUSINESS", "TRA/AIRTRAVEL", "DOM/JUSTICE",
  "TRA/OCEANTRAVEL", "NA/time", "NA/NA", "NA/NA", "MIL/ARMY", "NA/NA", "TRA/OCEANTRAVEL", "ECO/NATIONAL", "NA/NA", "ECO/BUSINESS",
  "EDU/SCHOOL", "NA/NA", "NA/NA", "NA/NA", "POL/COMMITTEE", "GEO/INDIA", "LEI/TOURISM&RESTAURANTS", "WEL/POVERTY", "LEI/TOURISM&RESTAURANTS", "DOM/JUSTICE",
  "NA/NA", "POL/PARLIAMENT", "NA/title", "ECO/NATIONAL", "ECO/MARKET", "ECO/BUSINESS", "FAM/MARRIAGE", "LAB/STRIKE", "LEI/FOOTBALL", "NA/NA",
  "HEA/HEALTHCARE", "GEO/ENGLAND", "DOM/JUSTICE", "MIL/NAVY", "ECO/RAW/METAL", "ECO/STOCKEXCHANGE", "NA/NA", "NA/time", "ECO/BUSINESS", "NA/NA",
  "PUB/NEWS", "ECO/RAW/AGRICULTURE", "POL/ROYAL", "LEI/TOURISM&RESTAURANTS", "NA/NA", "NA/NA", "NA/NA", "LEI/CLUBS", "ECO/RAW/AGRICULTURE", "NA/titles",
  "GEO/GEOPOLITICS", "NA/NA", "NA/NA", "NA/time", "GEO/EU", "MIL/NAVY", "NA/NA", "NA/NA", "EDU/UNIVERSITY", "NA/time"
)

# Project the noncrisis documents onto the crisis-corpus topic model: align
# the noncrisis vocabulary to stm250's vocabulary, then score the new
# documents with fitNewDocuments().
nc_outx <- alignCorpus(new = list(documents = nc.out2$documents, vocab = nc.out2$vocab, meta = nc.out2$meta), old.vocab = stm250$vocab)

# Earlier attempt kept for reference (passed bare year vectors instead of
# the metadata data frames):
# nc_stmx <- fitNewDocuments(model=stm250,documents=nc_outx$documents,newData=nc_outx$meta$year,origData=ox1000$meta$year,prevalence=~year)

nc_stmx <- fitNewDocuments(model = stm250, documents = nc_outx$documents, newData = nc_outx$meta, origData = ox1000$meta, prevalence = ~year)

save(nc_stmx, file = "nc_stmx.RData")

# save (nc_outx,file="nc_outx.RData")
# save (nc_stmx,file="nc_stmx.RData")

# Wide (documents x topics+meta) and long (document-topic pairs) versions
nc_STMx_W <- data.frame(nc_stmx$theta, nc_outx$meta)

nc_STMx_L <- pivot_longer(nc_STMx_W, paste0("X", 1:250))

# Attach the crisis-model topic labels at all three granularities
nc_STMx_L$topic20 <- to.lab[match(nc_STMx_L$name, to.lab$topic), "topic20"]
nc_STMx_L$topic50 <- to.lab[match(nc_STMx_L$name, to.lab$topic), "topic50"]
nc_STMx_L$topic250 <- to.lab[match(nc_STMx_L$name, to.lab$topic), "topic250"]





nc_STMx_L$decade <- floor(as.numeric(nc_STMx_L$year) / 10) * 10

decade_nc_STM20 <- aggregate(nc_STMx_L$topic20, by = list(year = nc_STMx_L$decade, topic = nc_STMx_L$name), FUN = "sum", na.rm = TRUE)
decade_nc_STM <- aggregate(nc_STMx_L$value, by = list(year = nc_STMx_L$decade, topic = nc_STMx_L$name), FUN = "sum", na.rm = TRUE)


year_nc_STM <- aggregate(nc_STMx_L$value, by = list(year = nc_STMx_L$year, topic = nc_STMx_L$name), FUN = "sum", na.rm = TRUE)
year_nc_STM$total <- rep(aggregate(nc_STMx_L$value, by = list(year = nc_STMx_L$year), FUN = "sum", na.rm = TRUE)$x, times = 250)
year_nc_STM$share <- year_nc_STM$x / year_nc_STM$total

# Wide year x topic share matrix at the 250-topic resolution.
year_nc_STMx_W <- pivot_wider(year_nc_STM, id_cols = c("year"), names_from = c("topic"), values_from = c("share"))

# Number of "active" topics per year at two share thresholds (0.1% and 1%);
# columns 2:251 are the 250 topic shares.
year_nc_STMx_W$active <- rowSums(year_nc_STMx_W[, 2:251] > 0.001, na.rm = TRUE)
year_nc_STMx_W$active2 <- rowSums(year_nc_STMx_W[, 2:251] > 0.01, na.rm = TRUE)

# Per-year Gini coefficient of topic shares (concentration of attention).
# NOTE(review): this assigns the one-column result of summarise() as a
# column, so "gini" is itself a data frame, not a plain numeric vector --
# downstream arithmetic works but consider unlisting. TODO confirm intended.
year_nc_STMx_W$gini <- rowwise(year_nc_STMx_W[, 2:251]) %>% summarise(gini = Gini(c_across()))

# Diversity = 1 - Gini.
year_nc_STMx_W$r.gini <- 1 - year_nc_STMx_W$gini

# Yearly diversity (1 - Gini) at the 250-topic resolution.
# Fix: the series is stored in column "rgini"; the original mapped
# aes(y = gini), a name that exists neither in df.gini nor as a free
# variable, so the plot aborted with "object 'gini' not found".
df.gini <- data.frame(year = as.Date(year_nc_STMx_W$year, format = "%Y"), rgini = year_nc_STMx_W$r.gini)
ggplot(df.gini, aes(x = year, y = rgini)) +
  geom_point()




# Same pipeline at the 50-label (topic-complex) resolution: yearly volume
# per topic50 label, yearly totals recycled, then shares.
year_nc_STM50 <- aggregate(nc_STMx_L$value, by = list(year = nc_STMx_L$year, topic = nc_STMx_L$topic50), FUN = "sum", na.rm = TRUE)
year_nc_STM50$total <- rep(aggregate(nc_STMx_L$value, by = list(year = nc_STMx_L$year), FUN = "sum", na.rm = TRUE)$x, times = length(unique(to.lab$topic50)))
year_nc_STM50$share <- year_nc_STM50$x / year_nc_STM50$total

year_nc_STMx_W50 <- pivot_wider(year_nc_STM50, id_cols = c("year"), names_from = c("topic"), values_from = c("share"))

# Columns 2:163 hold the 162 topic-complex share columns.
year_nc_STMx_W50$active <- rowSums(year_nc_STMx_W50[, 2:163] > 0.001, na.rm = TRUE)
year_nc_STMx_W50$active2 <- rowSums(year_nc_STMx_W50[, 2:163] > 0.01, na.rm = TRUE)


# NOTE(review): assigns a summarise() tibble as a column (see the 250-topic
# block above) -- works downstream but is a data-frame column.
year_nc_STMx_W50$gini <- rowwise(year_nc_STMx_W50[, 2:163]) %>% summarise(gini = Gini(c_across()))

# Diversity = 1 - Gini.
year_nc_STMx_W50$r.gini <- 1 - year_nc_STMx_W50$gini

# Yearly diversity (1 - Gini) at the topic-complex resolution.
# Fix: map y = rgini (the column created here); the original aes(y = gini)
# referenced a non-existent column and aborted the plot.
df.gini50 <- data.frame(year = as.Date(year_nc_STMx_W50$year, format = "%Y"), rgini = year_nc_STMx_W50$r.gini)
ggplot(df.gini50, aes(x = year, y = rgini)) +
  geom_point()



# Same pipeline at the 20-label (topic-area) resolution.
year_nc_STM20 <- aggregate(nc_STMx_L$value, by = list(year = nc_STMx_L$year, topic = nc_STMx_L$topic20), FUN = "sum", na.rm = TRUE)
# NOTE(review): "- 1" presumably excludes one label (e.g. an NA/NA group)
# so the recycled totals align with the aggregate groups -- TODO confirm,
# since the 50-topic version above does not subtract one.
year_nc_STM20$total <- rep(aggregate(nc_STMx_L$value, by = list(year = nc_STMx_L$year), FUN = "sum", na.rm = TRUE)$x, times = length(unique(to.lab$topic20)) - 1)
year_nc_STM20$share <- year_nc_STM20$x / year_nc_STM20$total


year_nc_STMx_W20 <- pivot_wider(year_nc_STM20, id_cols = c("year"), names_from = c("topic"), values_from = c("share"))

# Columns 2:22 hold the 21 topic-area share columns.
year_nc_STMx_W20$active <- rowSums(year_nc_STMx_W20[, 2:22] > 0.001, na.rm = TRUE)
year_nc_STMx_W20$active2 <- rowSums(year_nc_STMx_W20[, 2:22] > 0.01, na.rm = TRUE)

# NOTE(review): summarise() tibble assigned as a column, as above.
year_nc_STMx_W20$gini <- rowwise(year_nc_STMx_W20[, 2:22]) %>% summarise(gini = Gini(c_across()))

# Diversity = 1 - Gini.
year_nc_STMx_W20$r.gini <- 1 - year_nc_STMx_W20$gini

# Stacked-area plots of yearly topic shares at the three resolutions.
ggplot(year_nc_STM20, aes(x = as.Date(year, format = "%Y"), y = share, fill = topic)) +
  geom_area()
ggplot(year_nc_STM50, aes(x = as.Date(year, format = "%Y"), y = share, fill = topic)) +
  geom_area() +
  theme(legend.position = "none")
# NOTE(review): year_nc_STM250 is not created in this section (the 250-topic
# frame here is year_nc_STM) -- confirm it exists upstream.
ggplot(year_nc_STM250, aes(x = as.Date(year, format = "%Y"), y = share, fill = topic)) +
  geom_area() +
  theme(legend.position = "none")


# Yearly diversity (1 - Gini) at the topic-area resolution.
# Fix: map y = rgini; the original aes(y = gini) referenced a column that
# does not exist in df.gini20 and aborted the plot.
df.gini20 <- data.frame(year = year_nc_STMx_W20$year, rgini = year_nc_STMx_W20$r.gini)
ggplot(df.gini20, aes(x = as.Date(year, format = "%Y"), y = rgini)) +
  geom_point()

# Combine the three diversity (1 - Gini) series for a joint plot.
# Fix: the series are stored in column "rgini" of df.gini / df.gini50 /
# df.gini20; the original read the non-existent "gini" column (NULL), so
# the topic250/topic50/topic20 columns were never populated.
df.gini$topic250 <- df.gini$rgini
df.gini$topic50 <- df.gini50$rgini
df.gini$topic20 <- df.gini20$rgini

# Long format: one row per year x resolution (name / value).
df.gini_L <- pivot_longer(df.gini, c("topic250", "topic50", "topic20"))

routine_gini <- ggplot(df.gini_L, aes(y = value, shape = name, group = name, color = name, fill = name, x = as.Date(year, format = "%Y"))) +
  geom_point() +
  geom_smooth() +
  scale_color_viridis_d() +
  scale_fill_viridis_d() +
  theme_bluewhite() +
  ylim(0, 1) +
  theme(legend.position = c(0.75, 0.75)) +
  xlab("Year") +
  ylab("Diversity (1-Gini)") +
  ggtitle("Topic diversity (1-Gini) in routine news coverage 1785-2014")

# Share of active topics per year at each resolution, normalised by the
# number of topics, stacked long for plotting.
# NOTE(review): the 20-topic series divides active2 by 20 but active by 21,
# and the legend says "163 topic complexes"/"21 topic areas" while the wide
# frames hold 162 and 21 share columns -- reconcile denominators and labels.
df.active_L <- data.frame(year = rep(1785:2014, times = 3), active2 = c(year_nc_STMx_W$active2 / 250, year_nc_STMx_W50$active2 / 162, year_nc_STMx_W20$active2 / 20), active = c(year_nc_STMx_W$active / 250, year_nc_STMx_W50$active / 162, year_nc_STMx_W20$active / 21), resolution = rep(c("250 topics", "163 topic complexes", "21 topic areas"), each = 230))

# Active-topic share (0.1% threshold), as percentages.
routine_active <- ggplot(df.active_L, aes(y = active * 100, shape = resolution, group = resolution, color = resolution, fill = resolution, x = year)) +
  geom_point() +
  geom_smooth() +
  scale_color_viridis_d() +
  scale_fill_viridis_d() +
  theme_bluewhite() +
  ylim(0, 100) +
  theme(legend.position = c(0.25, 0.25)) +
  xlab("Year") +
  ylab("Share of active topics (%)") +
  ggtitle("Active topics in routine news coverage 1785-2014")

# Quick look at the stricter 1% threshold (not saved).
ggplot(df.active_L, aes(y = active2, shape = resolution, group = resolution, color = resolution, fill = resolution, x = year)) +
  geom_point() +
  geom_smooth() +
  scale_color_viridis_d() +
  scale_fill_viridis_d() +
  theme_bluewhite()

# Stacked-area overview plots (repeat of the block above).
ggplot(year_nc_STM20, aes(x = as.Date(year, format = "%Y"), y = share, fill = topic)) +
  geom_area()
ggplot(year_nc_STM50, aes(x = as.Date(year, format = "%Y"), y = share, fill = topic)) +
  geom_area() +
  theme(legend.position = "none")
# NOTE(review): year_nc_STM250 is not created in this section -- confirm.
ggplot(year_nc_STM250, aes(x = as.Date(year, format = "%Y"), y = share, fill = topic)) +
  geom_area() +
  theme(legend.position = "none")

# Export the two named figures.
ggsave(routine_active, file = "gg_routine_active.svg", device = "svg", unit = "cm", width = 16, height = 16, dpi = 1200, scale = 1.25)
ggsave(routine_gini, file = "gg_routine_gini.svg", device = "svg", unit = "cm", width = 16, height = 16, dpi = 1200, scale = 1.25)

# Keep copies under routine-specific names before the objects are reused
# in the crisis analysis below.
df.active_L -> df_routine.active_L
df.gini_L -> df_routine.gini_L

4.2 Crisis news wave detection and labelling

Code
########## ESCC-spike extractor

#  setwd(".//crisis2")

# Load precomputed objects: the daily topic panel (t250d), topic labels
# (to.lab), yearly Times article counts (thetimes), the document-feature
# matrices at three term-frequency trims (dx / dx100 / dx1000), and the
# document-topic assignments (topics_doc).
  load(".//crisis//t250d.RData")
  load(".//crisis//to_lab.RData")
  load(".//crisis//thetimes.RData")
  load(".//crisis//dx1000.RData")
  load(".//crisis//dx100.RData")
  load(".//crisis//dx.RData")
  load(".//crisis//topics_doc.RData")

# In long for loops, to display only every 1000th loop as progress indicator
  thousands <- seq(1000,dim(t250d)[1]/length(unique(t250d$topic)),1000)

# Define stopwords
# (English stopwords plus OCR artefacts, contractions and mojibake tokens.)
  sw <- c(stopwords(language="en"),"oclock", "minut", "yard", "morn", "half-past", "dont", "don't", "say", "did", "didn't", "get", "want", "just", "think", "cant", "cannot", "theyr", "wasnt", "wouldnt", "wasn't", "wouldn't", "havent", "haven't", "shouldn't", "should not", "ing", "tion", "con", "com", "pro", "ã‚â£", "ã‚â£m", "â£m", "•", "â–º", "“the", "britain’", "¬", "â", "—")

  #subset(t250d,topic==to.lab$topic250[1])$i.share.14[1:100]
  #subset(t250d,topic==to.lab$topic250[1])$i.share.14[(length(subset(t250d,topic==to.lab$topic250[1])$i.share.14)-100):length(subset(t250d,topic==to.lab$topic250[1])$i.share.14)]

# Add properly encoded time variables

# Month names to numbers
# NOTE(review): the patterns are German month names ("Mai", "Juni", "Juli",
# "Oktob", "Dezem"; "MÃ" is mojibake for "Mä(rz)") -- presumably an artefact
# of the export locale. Confirm the Time format before changing these.
  dx1000$Time <- str_replace(dx1000$Time,"Janua", "01")
  dx1000$Time <- str_replace(dx1000$Time,"Febru", "02")
  dx1000$Time <- str_replace(dx1000$Time,"MÃ", "03")
  dx1000$Time <- str_replace(dx1000$Time,"April", "04")
  dx1000$Time <- str_replace(dx1000$Time,"Mai", "05")
  dx1000$Time <- str_replace(dx1000$Time,"Juni", "06")
  dx1000$Time <- str_replace(dx1000$Time,"Juli", "07")
  dx1000$Time <- str_replace(dx1000$Time,"Augus", "08")
  dx1000$Time <- str_replace(dx1000$Time,"Septe", "09")
  dx1000$Time <- str_replace(dx1000$Time,"Oktob", "10")
  dx1000$Time <- str_replace(dx1000$Time,"Novem", "11")
  dx1000$Time <- str_replace(dx1000$Time,"Dezem", "12")


  # One daily time series per topic: split t250d by the 250-topic label,
  # preallocating the list and printing a progress counter.
  list.ts <- vector("list", nrow(to.lab))
  for (i in seq_len(nrow(to.lab))) {
    list.ts[[i]] <- subset(t250d, topic == to.lab$topic250[i])
    flush.console()
    print(i)
  }

  # Detrended moving averages per topic: 30-day target window against the
  # five longer baselines.
  list.dma <- pblapply(list.ts,create_dma,target=30,baselines=c(90,180,365,730,1825),align="right")

  # Detect candidate news waves in each topic series.
  list.nw  <- pblapply(list.dma,find_newswaves,target=30,baselines=c(90,180,365,730,1825))

  # list.dma[[1]][,c("DMA30", "DMA90", "DMA180", "DMA365", "DMA730", "DMA1825", "a")][1:100,]

  # Summarise each topic's waves (start/end/volume etc.).
  list.waves <- pblapply(list.nw,waveanalyzer)

  # Term-frequency-trimmed copies of the token matrix.
  # NOTE(review): tms_nzz carries the name of a different newspaper project
  # (NZZ) -- confirm it is the intended Times token matrix.
  tms_dx <- tms_nzz
  tms_dx %>% dfm_trim(min_termfreq=100) -> tms_dx100
  tms_dx %>% dfm_trim(min_termfreq=1000) -> tms_dx1000

  # Stack the per-topic wave tables into one data frame.
  # do.call(rbind, ...) binds everything at once instead of growing wd250
  # row-wise inside a loop (the original made a full copy per iteration,
  # i.e. quadratic work); the resulting data frame is identical.
  wd250 <- do.call(rbind, list.waves)

  # Convert day counters (day 1 = 1785-01-01) to dates; derive year/decade.
  wd250$start.date <- as.Date(wd250$start,origin="1784-12-31")
  wd250$end.date <- as.Date(wd250$end,origin="1784-12-31")
  wd250$year <- str_extract(wd250$start.date,"[:digit:]{4,4}")
  wd250$decade <- floor(as.numeric(wd250$year)/10)*10
  # Expected ("baseline") volume over the wave: duration times the mean of
  # the five baseline rates.
  wd250$baseline.volume <- wd250$duration*rowMeans(cbind(wd250[,c("baseline90", "baseline180", "baseline365", "baseline730", "baseline1825")]))
  # Volume threshold: 0.04% of that year's article output, floored at 5
  # articles; the row-wise maximum of the two is the effective threshold.
  wd250$r_threshold <- thetimes[match(wd250$year,thetimes$year),"articles"]*0.0004
  wd250$a_threshold <- 5
  wd250$v_threshold <- rowMaxs(cbind(wd250$r_threshold,wd250$a_threshold))
  # 1979 is set manually -- presumably because the publication suspension
  # during the 1978/79 industrial dispute distorts that year's article
  # count. TODO confirm.
  wd250[wd250$year==1979,"v_threshold"] <- 20


  # Keep waves clearing (half of) the volume threshold and a minimum intensity.
  wd250x <- subset(wd250,(volume+baseline.volume)>(v_threshold/2.4) & intensity>0.001)

  # Wave counts per decade / year.
  table(wd250x$decade)
  table(wd250x$year)

  ggplot(wd250x,aes(y=duration,x=as.numeric(year)))+geom_point()+geom_smooth()+theme_bluewhite()

  # Day counter for the document-feature matrices as well.
  dx1000$days <- as.numeric(as.Date(dx1000$Time)-as.Date("1784-12-31"))
  dx100$days <- as.numeric(as.Date(dx100$Time)-as.Date("1784-12-31"))
  dx$days <- as.numeric(as.Date(dx$Time)-as.Date("1784-12-31"))

  # Characteristic vocabulary per news wave: compare word use during the
  # wave with the 1000 days before/after, within the wave's topic and area,
  # at the three term-frequency trims.
  # NOTE(review): nzz_dx / nzz_dx100 / nzz_dx1000 are not created in this
  # section (only dx / dx100 / dx1000 are loaded above), and the names point
  # to a different newspaper project -- confirm the intended objects; a
  # later loop redoes this computation on `data`.
  W <- dim(wd250x)[1]
  wave.keywords <- list()
  wave.keywords.100 <- list()
  wave.keywords.1000 <- list()
  for (w in 1:W)
  {
    wave.keywords[[w]] <- compareVocabulary(data=nzz_dx,pre=1000,post=1000,start=wd250x$start[w],end=wd250x$end[w],nover=10,nunder=10,topic=wd250x$topic250[w],area=wd250x$topic50[w])
    wave.keywords.100[[w]] <- compareVocabulary(data=nzz_dx100,pre=1000,post=1000,start=wd250x$start[w],end=wd250x$end[w],nover=10,nunder=10,topic=wd250x$topic250[w],area=wd250x$topic50[w])
    wave.keywords.1000[[w]] <- compareVocabulary(data=nzz_dx1000,pre=1000,post=1000,start=wd250x$start[w],end=wd250x$end[w],nover=10,nunder=10,topic=wd250x$topic250[w],area=wd250x$topic50[w])
    flush.console()
    print(paste0(w,"/",W))
  }

  # 100 random wave ids in 1..1052 for spot-checking the keyword lists.
  # NOTE(review): no set.seed(), so the sample is not reproducible.
  rs <- round(runif(n=100,min=0.5,max=1052.5))

  wave.keywords[[rs[[1]]]]


  # Sequential wave id.
  wd250x$nid <- 1:dim(wd250x)[1]

  names(list.dma) <- to.lab$topic250

  # Zoomed time-series views around individual waves, at four time frames.
  i.001 <- nw_zoom(ts=list.dma,nw=wd250x[1,],nw.df=wd250x,tframe=c(365,730,1825,9125))
  i.002 <- nw_zoom(ts=list.dma,nw=wd250x[2,],nw.df=wd250x,tframe=c(365,730,1825,9125))
  i.003 <- nw_zoom(ts=list.dma,nw=wd250x[3,],nw.df=wd250x,tframe=c(365,730,1825,9125))
  i.004 <- nw_zoom(ts=list.dma,nw=wd250x[4,],nw.df=wd250x,tframe=c(365,730,1825,9125))
  i.005 <- nw_zoom(ts=list.dma,nw=wd250x[5,],nw.df=wd250x,tframe=c(365,730,1825,9125))
  i.006 <- nw_zoom(ts=list.dma,nw=wd250x[6,],nw.df=wd250x,tframe=c(365,730,1825,9125))
  i.007 <- nw_zoom(ts=list.dma,nw=wd250x[7,],nw.df=wd250x,tframe=c(365,730,1825,9125))
  i.008 <- nw_zoom(ts=list.dma,nw=wd250x[8,],nw.df=wd250x,tframe=c(365,730,1825,9125))
  i.009 <- nw_zoom(ts=list.dma,nw=wd250x[9,],nw.df=wd250x,tframe=c(365,730,1825,9125))
  i.010 <- nw_zoom(ts=list.dma,nw=wd250x[10,],nw.df=wd250x,tframe=c(365,730,1825,9125))

  i.101 <- nw_zoom(ts=list.dma,nw=wd250x[101,],nw.df=wd250x,tframe=c(365,730,1825,9125))
  i.201 <- nw_zoom(ts=list.dma,nw=wd250x[201,],nw.df=wd250x,tframe=c(365,730,1825,9125))


  # Inspect one wave's keyword lists at the three term-frequency trims.
  validate_nw(x.1=wave.keywords,x.2=wave.keywords.100,x.3=wave.keywords.1000,event_id=1)

  # NOTE(review): draft hand-labelling vector for the detected news waves
  # (ten labels per row, mostly still empty; "# 1041" marks the intended
  # final length). The original rows lacked the separating commas, making
  # the expression unparsable, and the result is not assigned anywhere --
  # presumably destined for an event-label column once complete. TODO confirm.
  c("1982 Falklands War(*)", "1973 Oil Crisis", "1980 Recession", "1981 Recession/Salvadorian Civil War/Solidarnosc Movement in Poland", "1981 Recession/Irish Hunger Strike", "1982 Falklands War", "1982 NHS Dispute/Crisis", "1990-92 BCCI Crisis/Recession/Iraq War", "1992 Recession", "1992 Recession/Election", #10
  "", "", "", "", "", "", "", "", "", "", #20
  "", "", "", "", "", "", "", "", "", "", #30
  "", "", "", "", "", "", "", "", "", "", #40
  "", "", "", "", "", "", "", "", "", "", #50
  "", "", "", "", "", "", "", "", "", "", #60
  "", "", "", "", "", "", "", "", "", "", #70
  "", "", "", "", "", "", "", "", "", "", #80
  "", "", "", "", "", "", "", "", "", "", #90
  "", "", "", "", "", "", "", "", "", "", #100
  "European Banking Crisis", "", "", "", "", "", "", "", "", "", #110
  "", "", "", "", "", "", "", "", "", "", #120
  "", "", "", "", "", "", "", "", "", "", #130
  "", "", "", "", "", "", "", "", "", "", #140
  "", "", "", "", "", "", "", "", "", "", #150
  "", "", "", "", "", "", "", "", "", "", #160
  "", "", "", "", "", "", "", "", "", "", #170
  "", "", "", "", "", "", "", "", "", "", #180
  "", "", "", "", "", "", "", "", "", "", #190
  "", "", "", "", "", "", "", "", "", "", #200
  "1881 Ganges/Shipwrecks", "", "", "", "", "", "", "", "", "" #210

  ) # 1041


  # Recompute the detrended moving averages over the full topic panel.
  # NOTE(review): t250dd is not used in the lines below, which derive the
  # activity flags directly on t250d -- confirm which path is canonical.
  t250dd <- create_dma(x=t250d,topics=to.lab$topic250,target=30,baselines=c(90,180,365,730,1825))

  ##### Is the 14day moving average above the longer term averages 30d, 90d, 180d, 365d, 730d, 1825d?
  # NOTE(review): despite the comment, the 30-day average is compared here.
  t250d$a90 <- 1*(t250d$i.count.30>t250d$i.count.90)
  t250d$a180 <- 1*(t250d$i.count.30>t250d$i.count.180)
  t250d$a365 <- 1*(t250d$i.count.30>t250d$i.count.365)
  t250d$a730 <- 1*(t250d$i.count.30>t250d$i.count.730)
  t250d$a1825 <- 1*(t250d$i.count.30>t250d$i.count.1825)
  ##### Sum of the long-term averages the 14day moving average exceeds (0-6)
  # (with the five baselines above, a actually ranges 0-5)
  t250d$a <- t250d$a90+t250d$a180+t250d$a365+t250d$a730+t250d$a1825
  ##### 1-day time lagged version of a, to see when the number of exceedings increases from 0 to 1 or higher.
  # Fix: a proper lag drops the LAST element (a[-length(a)]); the original
  # dropped element length-1, leaving the final position unlagged (it held
  # a[n] instead of a[n-1]).
  t250d$a.lag1 <- c(NA,t250d$a[-length(t250d$a)])
  ##### Check if the number of long-term averages that are exceeded turns greater than 0, or moves from (greater than 0) to 0.
  # Mark the first row of each topic's 86,197-day series so lags do not
  # bleed across topic boundaries.
  t250d$new.topic <- 0
  t250d$new.topic[seq(1,length(t250d$new.topic),86197)] <- 1
  # A wave turns on when a crosses from <3 to >=3 exceeded baselines, and
  # off on the reverse crossing.
  t250d$turn.on <- 1*(t250d$a.lag1<3 & t250d$a>2)
  t250d$turn.off <- 1*(t250d$a<3 & t250d$a.lag1>2)
  ##### Potential news wave active?
  t250d$pnw <- (t250d$turn.on==1 | t250d$a>0)
  ##### Running number of potential news wave
  t250d$pnw.no <- cumsum(t250d$turn.on)
  t250d$a.pnw.no <- (t250d$a>0)*t250d$pnw.no
  ##### Above-average coverage shares
  # NOTE(review): t250d$a30 is never created (only a90..a1825 exist above),
  # so the v30/n30 lines would assign a zero-length result and fail; the
  # ".30 - .30" differences are also identically zero. These look like
  # leftovers from an earlier baseline set -- confirm before running.
  t250d$v30 <- (t250d$a30)*(t250d$i.share.30-t250d$i.share.30) 
  t250d$v90 <- (t250d$a90)*(t250d$i.share.30-t250d$i.share.90) 
  t250d$v180 <- (t250d$a180)*(t250d$i.share.30-t250d$i.share.180) 
  t250d$v365 <- (t250d$a365)*(t250d$i.share.30-t250d$i.share.365) 
  t250d$v730 <- (t250d$a730)*(t250d$i.share.30-t250d$i.share.730) 
  t250d$v1825 <- (t250d$a1825)*(t250d$i.share.30-t250d$i.share.1825) 
  ##### Above-average coverage count
  # NOTE(review): same a30 / zero-difference issue as above.
  t250d$n30 <- (t250d$a30)*(t250d$i.count.30-t250d$i.count.30) 
  t250d$n90 <- (t250d$a90)*(t250d$i.count.30-t250d$i.count.90) 
  t250d$n180 <- (t250d$a180)*(t250d$i.count.30-t250d$i.count.180) 
  t250d$n365 <- (t250d$a365)*(t250d$i.count.30-t250d$i.count.365) 
  t250d$n730 <- (t250d$a730)*(t250d$i.count.30-t250d$i.count.730) 
  t250d$n1825 <- (t250d$a1825)*(t250d$i.count.30-t250d$i.count.1825) 

  # Initialise the within-wave cumulative columns.
  t250d$wv30 <- 0
  t250d$wv90 <- 0
  t250d$wv180 <- 0
  t250d$wv365 <- 0
  t250d$wv730 <- 0
  t250d$wv1825 <- 0
  t250d$wn30 <- 0
  t250d$wn90 <- 0
  t250d$wn180 <- 0
  t250d$wn365 <- 0
  t250d$wn730 <- 0
  t250d$wn1825 <- 0


  # Full day index, day 1 = 1785-01-01 through day 86197; find days missing
  # from the observed topic-day table.
  alldays <- 1:86197
  missdays <- (alldays%in%to250_day$day[1:44074])
  misseddays <- alldays[missdays==FALSE]

  # Lookup table day counter -> calendar date.
  date.table <- data.frame(day=alldays)
  date.table$Date <- as.Date(date.table$day,origin="1784-12-31")

  # Zero-count rows for every missing day x topic combination, so the panel
  # is balanced; share is unknown (NA) on days without any articles.
  to250_addday <- data.frame(day=rep(misseddays,times=250),topic=rep(to.lab$topic250,each=length(misseddays)),count=0,total.count=0,share=NA)

  to250d <- rbind(to250_day,to250_addday)

  # Sort by topic, then day, to get contiguous per-topic series.
  sorter <- order(to250d$topic,to250d$day)

  t250d <- to250d[sorter,]

  # Progress checkpoints for long loops.
  count100k <- seq(0,dim(t250d)[1],100000)
  count1k <- seq(0,dim(t250d)[1],1000)

  # share0: NA -> 0; i.share: last observation carried forward.
  t250d$share0 <- replace(t250d$share,is.na(t250d$share),0)
  t250d$i.share <- LOCF(t250d$share)

  t250d$count0 <- replace(t250d$count,is.na(t250d$count),0)

##### Create moving averages for weeks, 14-days, months, quarters, semi-annual, annual, biannual, quinquennial

  # Right-aligned rolling means of the zero-filled daily share, one window
  # length at a time, computed separately within each topic's series.
  for (i in 1:250)
  {
    rows <- t250d$topic == to.lab[i, "topic250"]
    for (k in c(7, 14, 30, 90, 180, 365, 730, 1825)) {
      t250d[rows, paste0("i.share.", k)] <- rollmean(k=k,x=t250d[rows, "share0"],align="right",fill=c(NA,NA,NA))
    }
    print(i)
    flush.console()
  }

  # Same rolling windows for the zero-filled daily article counts.
  for (i in 1:250)
  {
    rows <- t250d$topic == to.lab[i, "topic250"]
    for (k in c(7, 14, 30, 90, 180, 365, 730, 1825)) {
      t250d[rows, paste0("i.count.", k)] <- rollmean(k=k,x=t250d[rows, "count0"],align="right",fill=c(NA,NA,NA))
    }
    print(i)
    flush.console()
  }

  # Back-fill each window's warm-up period per topic: the first k-1 days
  # (left NA by the right-aligned rollmean) take the first computed value.
  for (i in 1:250)
  {
    rows <- t250d$topic == to.lab[i, "topic250"]
    for (k in c(7, 14, 30, 90, 180, 365, 730, 1825)) {
      col <- paste0("i.count.", k)
      t250d[rows, col][1:(k - 1)] <- t250d[rows, col][k]
    }
    print(i)
    flush.console()
  }

  # Fill remaining NAs in the count averages per topic: next observation
  # carried backward over the first half of each series (days 1-42999),
  # last observation carried forward over the second half (43000-86197).
  for (i in 1:250)
  {
    rows <- t250d$topic == to.lab[i, "topic250"]
    for (k in c(7, 14, 30, 90, 180, 365, 730, 1825)) {
      col <- paste0("i.count.", k)
      t250d[rows, col][1:42999] <- imputeTS::na_locf(t250d[rows, col][1:42999],option="nocb")
      t250d[rows, col][43000:86197] <- imputeTS::na_locf(t250d[rows, col][43000:86197],option="locf")
    }
    print(i)
    flush.console()
  }



##### Is the 14day moving average above the longer term averages 30d, 90d, 180d, 365d, 730d, 1825d?
# NOTE(review): despite the comment, the 30-day average is compared here.
  t250d$a90 <- 1*(t250d$i.count.30>t250d$i.count.90)
  t250d$a180 <- 1*(t250d$i.count.30>t250d$i.count.180)
  t250d$a365 <- 1*(t250d$i.count.30>t250d$i.count.365)
  t250d$a730 <- 1*(t250d$i.count.30>t250d$i.count.730)
  t250d$a1825 <- 1*(t250d$i.count.30>t250d$i.count.1825)

##### Sum of the long-term averages the 14day moving average exceeds (0-6)
# (with the five baselines above, a actually ranges 0-5)
  t250d$a <- t250d$a90+t250d$a180+t250d$a365+t250d$a730+t250d$a1825

##### 1-day time lagged version of a, to see when the number of exceedings increases from 0 to 1 or higher.
# Fix: a proper lag drops the LAST element (a[-length(a)]); the original
# dropped element length-1, leaving the final position unlagged (it held
# a[n] instead of a[n-1]).
  t250d$a.lag1 <- c(NA,t250d$a[-length(t250d$a)])

##### Check if the number of long-term averages that are exceeded turns greater than 0, or moves from (greater than 0) to 0.
# Mark the first row of each topic's 86,197-day series so lags do not bleed
# across topic boundaries; a wave turns on when a crosses from <3 to >=3
# exceeded baselines, and off on the reverse crossing.
  t250d$new.topic <- 0
  t250d$new.topic[seq(1,length(t250d$new.topic),86197)] <- 1
  t250d$turn.on <- 1*(t250d$a.lag1<3 & t250d$a>2)
  t250d$turn.off <- 1*(t250d$a<3 & t250d$a.lag1>2)

##### Potential news wave active?
  t250d$pnw <- (t250d$turn.on==1 | t250d$a>0)

##### Running number of potential news wave
  t250d$pnw.no <- cumsum(t250d$turn.on)
  t250d$a.pnw.no <- (t250d$a>0)*t250d$pnw.no

##### Above-average coverage shares
# NOTE(review): t250d$a30 is never created (only a90..a1825 exist above), so
# the v30/n30 lines would assign a zero-length result and fail; the
# ".30 - .30" differences are also identically zero. Apparent leftovers from
# an earlier baseline set -- confirm before running.
  t250d$v30 <- (t250d$a30)*(t250d$i.share.30-t250d$i.share.30) 
  t250d$v90 <- (t250d$a90)*(t250d$i.share.30-t250d$i.share.90) 
  t250d$v180 <- (t250d$a180)*(t250d$i.share.30-t250d$i.share.180) 
  t250d$v365 <- (t250d$a365)*(t250d$i.share.30-t250d$i.share.365) 
  t250d$v730 <- (t250d$a730)*(t250d$i.share.30-t250d$i.share.730) 
  t250d$v1825 <- (t250d$a1825)*(t250d$i.share.30-t250d$i.share.1825) 

##### Above-average coverage count
# NOTE(review): same a30 / zero-difference issue as above.
  t250d$n30 <- (t250d$a30)*(t250d$i.count.30-t250d$i.count.30) 
  t250d$n90 <- (t250d$a90)*(t250d$i.count.30-t250d$i.count.90) 
  t250d$n180 <- (t250d$a180)*(t250d$i.count.30-t250d$i.count.180) 
  t250d$n365 <- (t250d$a365)*(t250d$i.count.30-t250d$i.count.365) 
  t250d$n730 <- (t250d$a730)*(t250d$i.count.30-t250d$i.count.730) 
  t250d$n1825 <- (t250d$a1825)*(t250d$i.count.30-t250d$i.count.1825) 

  # Initialise the within-wave cumulative columns.
  t250d$wv30 <- 0
  t250d$wv90 <- 0
  t250d$wv180 <- 0
  t250d$wv365 <- 0
  t250d$wv730 <- 0
  t250d$wv1825 <- 0
  t250d$wn30 <- 0
  t250d$wn90 <- 0
  t250d$wn180 <- 0
  t250d$wn365 <- 0
  t250d$wn730 <- 0
  t250d$wn1825 <- 0

  # Split t250d into a per-topic list (parallel to to.lab$topic250).
  t250dl <- vector("list", length(to.lab$topic250))

  for (i in seq_along(to.lab$topic250)) {
      t250dl[[i]] <- subset(t250d, topic == to.lab$topic250[i])
      print(i)
      flush.console()
    }

  # Keep the wave number only while the topic exceeds at least one baseline.
  t250d$a.pnw.no <- replace(t250d$a.pnw.no,t250d$a.pnw.no==0,NA)

  # Progress checkpoints for long loops.
  thousands <- seq(1000,dim(t250d)[1],1000)



  # Run the wave analyzer on each topic's daily series, preallocating the
  # result list.
  wavedata <- vector("list", 250)

  for (i in seq_len(250)) {
      topic_ts <- subset(t250d, topic == to.lab$topic250[i])
      wavedata[[i]] <- waveanalyzer(x = topic_ts)
      print(i)
      flush.console()
    }

  # Stack the per-topic wave tables into one data frame.
  # do.call(rbind, ...) binds everything at once instead of growing wd with
  # rbind() inside a loop (quadratic copying); the result is identical.
  wd <- do.call(rbind, wavedata)

  # Keep a copy before wd250x is modified below.
  wd250y <- wd250x

  load("topics_doc.RData")

  # Restrict the dfm to documents with topic assignments.
  # NOTE(review): dfm_subset() expects a logical vector, but match() returns
  # integer positions (with NA for non-matches) -- likely meant
  # dx1000$id %in% topics_doc$id. TODO confirm.
  data <- dfm_subset(dx1000,match(dx1000$id,topics_doc$id))
  
  # NOTE(review): topic.eq is not defined in this section -- confirm it is
  # created upstream.
  topic.eq$topicX <- topic.eq$topic



  # Recompute the per-wave keyword lists on the topic-filtered dfm `data`
  # (this overwrites the earlier wave.keywords built from the nzz_* objects).
  W <- dim(wd250x)[1]
  wave.keywords <- list()
  for (w in 1:W)
    {
      wave.keywords[[w]] <- compareVocabulary(data=data,pre=1000,post=1000,start=wd250x$start[w],end=wd250x$end[w],nover=10,nunder=10,topic=wd250x$topic250[w],area=wd250x$topic50[w])
      flush.console()
      print(paste0(w,"/",W))
    }

  # Collapse a wave's keyword vector into one space-separated string.
  #
  # x: a list with a character element $wordlist.
  # Returns: a single string.
  paste.wordlist <- function(x){
      paste(x$wordlist, collapse = " ")
    }

  # Concatenate the first hlnum headlines of a wave into one string.
  #
  # x: a list with a data-frame element $texts holding a "headline" column.
  # hlnum: number of headlines to take (rows 1:hlnum).
  # Returns: a single string.
  paste.headlines <- function(x,hlnum){
      paste(x$texts[1:hlnum, "headline"], collapse = " ")
    }

  # Expected baseline volume over each wave's duration.
  wd250x$baseline.volume <- wd250x$duration*rowMeans(cbind(wd250x[,c("baseline90", "baseline180", "baseline365", "baseline730", "baseline1825")]))

  # Calendar dates (day 1 = 1785-01-01).
  wd250x$start.date <- as.Date(wd250x$start,origin="1784-12-31")
  wd250x$end.date <- as.Date(wd250x$end,origin="1784-12-31")

  # One keyword string / headline string (top 10) per wave.
  # NOTE(review): lapply() yields list-columns here; sapply/vapply would
  # give plain character columns -- confirm intended.
  wd250x$keywords <- lapply(wave.keywords,paste.wordlist)

  wd250x$headlines <- lapply(wave.keywords,paste.headlines,hlnum=10)

  wd250x$event_id <- 1:dim(wd250x)[1]

  # Hand labels for waves 1-50, NA-padded to 1052 waves in total.
  # NOTE(review): immediately superseded by the fuller assignment that
  # follows -- this first vector is dead code.
  wd250x$event_label <- c("1979 Iran Hostage Crisis", "1974 Banking Crisis", "1975 Recession", "1976 Recession", "1980 Recession", "1981 Recession", "1982 Argentina Crisis", "1982 (invalid)", "1991 Recession", "1992 Recession",
    "1992 Extreme Weather", "1998 Asian Crisis", "2020 Recession", "1947 Postwar Reverberations", "1979 Iranian Hostage Crisis", "1979 (invalid)", "1998 (invalid)", "2000 (invalid)", "2004 (invalid) ", "2009 (invalid)",
    "1853 Cholera Epidemic", "1854 Cholera Epidemic", "1885 Cholera in Spain", "1886 Cholera in Italy", "1892 Cholera in Paris", "1893 Cholera", "2008 Lehman Brothers", "2020 Stock market crisis", "1957 Asian Flu", "1967 Foot and Mouth Disease",
    "2001 Foot and Mouth Disease", "2005 Bird Flu", "2009 Swine Flu/Mexican Flu", "2020 COVID-19 Pandemic", "1991 Theatre Season", "1992 Theatre Season", "1993 Theatre Season", "1995 Theatre Season", "1973 Oil Crisis", "2010 Deepwater Horizon Disaster",
    "1832 Cholera in Paris", "1845 Railroad Crisis", "1849 Debate on Theories of Cholera", "1853 Debate on Theories of Cholera", "1854 Railroad disasters report", "1854 Cholera outbreak report", "1857 Fiscal/Banking Crisis Scotland", "1866 Debate on Theories of Cholera", "1868 Abergele rail disaster/report", "1871 Explosives: Gun-cotton", #50
    rep(NA,times=1052-50))
    
    
  wd250x$event_label <- c("1979 Iran Hostage Crisis", "1974 Banking Crisis", "1975 Recession", "1976 Recession", "1980 Recession", "1981 Recession", "1982 Argentina Crisis", "1982 (invalid)", "1991 Recession", "1992 Recession",
    "1992 Extreme Weather", "1998 Asian Crisis", "2020 Recession", "1947 Postwar Reverberations", "1979 Iranian Hostage Crisis", "1979 (invalid)", "1998 (invalid)", "2000 (invalid)", "2004 (invalid) ", "2009 (invalid)",
    "1853 Cholera Epidemic", "1854 Cholera Epidemic", "1885 Cholera in Spain", "1886 Cholera in Italy", "1892 Cholera in Paris", "1893 Cholera", "2008 Lehman Brothers", "2020 Stock market crisis", "1957 Asian Flu", "1967 Foot and Mouth Disease",
    "2001 Foot and Mouth Disease", "2005 Bird Flu", "2009 Swine Flu/Mexican Flu", "2020 COVID-19 Pandemic", "1991 Theatre Season", "1992 Theatre Season", "1993 Theatre Season", "1995 Theatre Season", "1973 Oil Crisis", "2010 Deepwater Horizon Disaster",
    "1832 Cholera in Paris", "1845 Railroad Crisis", "1849 Debate on Theories of Cholera", "1853 Debate on Theories of Cholera", "1854 Railroad disasters report", "1854 Cholera outbreak report", "1857 Fiscal/Banking Crisis Scotland", "1866 Debate on Theories of Cholera", "1868 Abergele rail disaster/report", "1871 Explosives: Gun-cotton", #50    
    "1872 Constitutional Crisis in Germany and France", "1873 Wigan Railway Accident/series of accidents", "1874 Shipton Railway Accident/series of accidents", "1876 Debate on Theories of Epidemics", "1878 Yellow Fever/series of accidents", "1879 Tay-Bridge Disaster", "1880 The Eastern Crisis", "1883 SS Daphne Disaster/series of accidents/Cholera outbreaks", "1885 Ministerial Crisis/colliery explosions/Cholera outbreaks", "1885 Cholera in Spain/Eastern Crisis",#60
    "1886/87 Life Boat Disasters", "1887 Series of Fires, explosions, epidemics", "1890 Asiatic Flu/Russian Flu", "1890 Shipping/Financial Crisis", "1892 Cholera in Europe", "1894 (invalid)", "1895 Venezuelan Crisis", "1897 Typhoid Epidemic (Maidstone)", "1899 Crisis in Hungary", "1899 Crisis in the Church/Bullfinch disaster/Transvaal Crisis",#70
    "1901 Crisis in the British Industry (strikes)", "1903 Hungarian Crisis/Epidemic at Cambridge", "1903 Crisis in the Far East", "1907 Education crisis", "1907 Banking Crisis (Panic of 1907)", "1908 Crisis in Eastern Europe", "1910 Rat Plague in East Anglia/1911 Whitehaven Colliery Disaster", "1913 Aisgill Rail Accident/1913 Mine Disasters", "1914 Pre-War Crises/Ireland", "1931 Financial Crisis", #80
    "1938 Sudeten Crisis/Munich Agreement", "1947 Coal/Energy/Industry Crisis", "1947 Currency/Financial Crisis", "1973 Oil Crisis", "1978 (invalid)", "1979 Afghanistan War/Iran Hostage Crisis", "1982 Falkland War", "1845 French Agricultural Crisis", "1973 Oil Crisis", "1980 Recession",#90
    "1990-1992 Recession", "1997 Asian Financial Crisis", "2008 Financial Crisis", "2011 European Debt Crisis", "2020 Covid Pandemic", "1987 Ferry Disaster (MS Herald of Free Enterprise)", "1832 Cholera Epidemic (Paris/London)", "1930 R101 Airship Disaster", "1931 Financial Crisis", "1979 Afghanistan-Soviet War/1979 Iranian Hostage Crisis",#100
    "1980 Strikes/Protests in Poland", "1842 Spanish Ministerial Crisis", "1845 Ministerial Crisis in England", "1846 Affairs of Spain", "1851 Ministerial Crisis in England (Lord John Russell)", "1870 Italian Revolution", "1872 Constitutional Crisis in Prussia/Ministerial Crisis in Hungary", "1876 Ministerial Crisis in France", "1877 Crisis in France", "1880 Eastern Crisis", # 110
    "1882 Crisis in Egypt", "1883 Crisis in France/Spain", "1885 Ministerial Crisis (Gladstone)", "1886 Greek Crisis/Eastern Crisis", "1886 Ministerial Crisis (Salisbury)", "1887 Ministerial Crisis in France", "1887 Ministerial Crisis in France", "1881 Ministerial Crisis in France", "1889 Japanese Cabinet Crisis", "1890 Ministerial Crisis in Portugal", # 120
    "1891 Cabinet Crisis in Norway and Italy", "1892 Cabinet Crisis in Italy, France, and Greece", "1893 Crisis in Serbia/Austria", "1894 Ministerial Crisis in Belgium", "1894 Ministerial14 Crisis in Hungary, Italy, and Bulgaria", "1894/95 Resignation of the French President", "1897 Eastern Crisis (Greco-Turkish War)", "1897 Ministerial Crisis in Italy/Crisis in Austria", "1898 Crisis in France/Ministerial Crisis in Italy", "1899 Crisis in Hungary", # 130
    "1899 French Cabinet Crisis", "1901 Cabinet Crisis in Spain, Romania, and Italy", "1903 Bulgarian Cabinet Crisis/Balkan Crisis", "1903 Hungarian Crisis/Crisis in Greece", "1903 Hungarian Crisis", "1904 Hungarian Crisis", "1905 Swedish-Norwegian Conflict", "1909 Greek Crisis/Hungarian Crisis", "1914 Resignation of Egyptian, Italian, and Japanese Cabinets", "1923 Cabinet Crises in Germany, Poland, Netherlands, Portugal/Bengal Crisis", # 140
    "1925 Belgian Cabinet Crisis", "1944 Post-Nazi Government Crises (Italy, Belgium, Norway, Poland, Greece)", "1849 Cholera Epidemic", "1854 Cholera in North England", "1899 Transvaal Crisis", "1900 Boxer Rebellion", "1914 Sealing Disaster (SS Newfoundland)/Empress of Ireland Disaster/War Preparations", "1931 British Honduras Disaster (hurricane)/1931 Currency Crisis", "1971 Currency Crisis/Bretton Woods Crisis", "1980 Afghanistan War/Iran Hostage Crisis",#150
    "1887 Scarlet Fever Epidemic", "1890 Influenza Epidemic", "1918 Spanish Flu", "1938 Nazi Threat/War Preparations", "1947 Energy and Industry Recovery", "1947 Cholera in Punjab and Egypt", "1957 Recession", "1973 Oil Crisis", "1978 Economic Problems", "1979 Afghanistan-Soviet War",#160
    "1980/81 Unemployment Surge/Recession", "1982 Flu & Pertussis Epidemics/Falkland War", "1991 Recession", "1998 Flu Epidemic", "2008 Financial Crisis", "2015 NHS Crisis (A&E capacity)", "2015 Migration Crisis (stress on NHS)", "2020 Covid Pandemic (A&E capacity)", "1987 NHS Crisis (hospital beds)", "2014 NHS crisis (A&E capacity)",#170
    "2017 NHS Crisis (staff shortages)", "2017/18 Flu Season and NHS Crisis", "2020 COVID Crisis/strain on NHS", "2014/2015 Migrantion Crisis", "2007 Mortgage Crisis (Northern Rock)", "2008 Financial Crisis", "2011 European Debt Crisis", "2015 Migration Crisis (Greece)", "2020 COVID Crisis", "1985 Boeing Disaster (British Airtours 28M",#180
    "1988 Lockerbie Air Disaster (PanAm 103 Crash)", "1973 Oil Crisis (Japan)", "1983 Oil Output", "2008 Lehman Brothers (Oil Prices)", "2014-2016 Oil Price Fall", "2008 Financial Crisis (Pensions)", "2016 BHS Collapse (retailer)/NHS Pensions", "2020 COVID Crisis (Pensions Crisis)", "1852 RMS Amazon Shipwreck", "1873 RMS Atlantic Shipwreck",#190
    "1876 Great Queensland Shipwreck", "1877 (various shipwrecks)", "1881 RMS Teuton Shipwreck", "1884 Daniel Steinmann shipwreck", "1890 HMS Serpent/Ertugrul Shipwrecks", "1891 Utopia shipwreck", "1892 Bokhara shipwreck/SS Romania shipwreck", "1900 Boxer Rebellion (shipping of troops)", "1901 The Russie Shipwreck", "1904 Russo-Japanese War",#200
    "1912 RMS Titanic Shipwreck", "1913-1914 Irish Crisis", "1931 German Crisis", "1938 Sudeten Crisis (negotiations)", "1947 Coal Crisis (negotiations)", "1947 Financial Crisis/Marshall Plan", "1956 Suez Crisis", "1958 Taiwan Straits Crisis", "1961 Berlin Crisis", "1962 Cuba Crisis",#210
    "1964 Cyprus Crisis", "1968 Money Talks (revaluation of German Mark)", "1971 Monetary Crisis", "1973 Currency crisis", "1973 Oil Crisis", "1974 Turkish Invasion of Cyprus", "1979 Irianian Hostage Crisis/1979 Afghanistan War", "1980 Cyprus Negotiations", "1982 Falkland War: Peruvian Peace Initiative", "1986 Israel Cabinet Crisis/Chernobyl",#220
    "2008 Financial Crisis: Bailouts", "2011 European Debt Crisis", "2015 Migration Crisis", "1913 Senghenydd Colliery Disaster/Sealing Shipwreck/Empress of Ireland Shipwreck", "2008 Financial Crisis", "2020 COVID-19 Crisis", "1985 Tin Crisis", "2016 Brexit Referendum", "2018-2019 Brexit Negotiations", "2019 Brexit Consequences",#230
    "1878 Apedale/Unity Brook/Kersley Colliery Disasters", "1889 Penicuick Pit Disaster/Antwerp/Mossfield", "1889-1890 Mauricewook/Morfa/Pontypool Colliery Disasters", "1893 Thornhill Colliery Explosion/Coal Trade Crisis", "1908 Wigan Colliery Disaster", "1910 Whitehaven Pit Disaster", "1910 Houlton/Pretoria Pit Disaster (Bolton)", "1914 Senghenydd Colliery Disaster", "1950 Creswell Colliery Disaster", "1971 Currency Revaluations",#240
    "1982 Recession", "1973 Oil Crisis: Inflation", "1979 Recession", "1980 Recession", "2008 Financial Crisis", "2020 COVID-19 Crisis", "1988 King's Cross Underground Fire Report", "1903 Crisis in the Far East (pre Russo-Japanese War)", "2020 COVID-19 Crisis: Psychosocial consequences", "1997 Hong Kong Crisis",#250
    "2008 Financial Crisis: Job market for graduates", "2020 COVID-19 Crisis: Job market requirements", "1947 Industry/Steel/Coal Crisis", "1947 Industry/Steel/Coal Crisis (2)", "1973 Oil Crisis: Steel Industry", "1979 Steel Strikes", "1980 Steel Quotas Negotiations", "2020 COVID-19 Crisis: Hospitals and Long-term effects", "2015 Migration Crisis", "1979 Mount Erebus Disaster",#260
    "1973 Oil Crisis: Regulations/Restrictions/Rationing", "2020 COVID-19 Crisis: Regulations/Restrictions/Rationing", "1992: (Personal Crises)", "2008: Financial Crisis (Personal Crises)", "2011: European Debt Crisis (Personal crises)", "2015: Migration Crisis (Personal crises)", "2020: COVID-19 Crisis (Personal crises)", "1931: UK General Election (MacDonald, National)", "1982: Falkland War: Tory Wins at Local Elections", "2020: COVID-19 Crisis",#270
    "1887 Scarlet Fever Pandemic (London), strain on hospitals", "2020 COVID-19 Crisis: Strain on hospitals", "1979 Societ-Afghan War/Pakistan Crisis", "2018 BHS Report (Business auditors)", "2020 COVID-19 Crisis: Auditing shortages", "1853 Railway Accidents (various minor)", "1861 London and Brighton Railway Disaster", "1868 Abergele Railway Disaster", "1873 Wigan Railway Accident", "1874 Shipton Train Crash",#280
    "1880 Kibworth Railway Accident", "1907 Shewsbury Rail Disaster", "1910 Hawes Junction Rail Crash", "1913 Colchester Railway Accident", "1874 Shipton Railway Accident", "1876 (various shipwrecks)", "1878 Princess Alice Shipwreck", "1882 Egypt Crisis (Navy)/Anglo-Egyptian War", "1885 Eastern Crisis/Bulgarian Crisis", "1890 (invalid/Shipping)",#290
    "1893 Victoria Shipwreck/Cholera/Coal Trade Crisis", "1900 Boxer Rebellion (Navy)", "1916 Greek Crisis/Collision in the Irish Channel", "1864 Shipwreck at Fifeness/Glasgow Harbor/SS Iowa", "1873 Ville du Havre Shipwreck", "1877 Avelanche/Forest Ship Collision", "1877 Various Shipping Disasters", "1881 Teuton Shipwreck", "1883 Cimbria Shipwreck", "1884 Daniel Steinmann/State of Florida Shipwrecks",#300
    "", "", "", "", "", "", "", "", "", "",#310
    "", "", "", "", "", "", "", "", "", "",#320
    "", "", "", "", "", "", "", "", "", "",#330
    "", "", "", "", "", "", "", "", "", "",#340
    "", "", "", "", "", "", "", "", "", "",#350
    "", "", "", "", "", "", "", "", "", "",#360
    "", "", "", "", "", "", "", "", "", "",#370
    "", "", "", "", "", "", "", "", "", "",#380
    "", "", "", "", "", "", "", "", "", "",#390
    "", "", "", "", "", "", "", "", "", "",#400
    "", "", "", "", "", "", "", "", "", "",#410
    "", "", "", "", "", "", "", "", "", "",#420
    "", "", "", "", "", "", "", "", "", "",#430
    "", "", "", "", "", "", "", "", "", "",#440
    "", "", "", "", "", "", "", "", "", "",#450
    "", "", "", "", "", "", "", "", "", "",#460
    "", "", "", "", "", "", "", "", "", "",#470
    "", "", "", "", "", "", "", "", "", "",#480
    "", "", "", "", "", "", "", "", "", "",#490
    "", "", "", "", "", "", "", "", "", "",#500
    "", "", "", "", "", "", "", "", "", "",#510
    "", "", "", "", "", "", "", "", "", "",#520
    "", "", "", "", "", "", "", "", "", "",#530
    "", "", "", "", "", "", "", "", "", "",#540
    "", "", "", "", "", "", "", "", "", "",#550
    "", "", "", "", "", "", "", "", "", "",#560
    "", "", "", "", "", "", "", "", "", "",#570
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",#600
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",#650
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",#700
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",#750
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",#800
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",#850
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",#900
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",#950
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",#1000
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "",#1050
    "", "")

  ## Daily coverage of the epidemic/disease topic, plotted at six nested time
  ## windows (235y -> 1y); in each panel a grey rectangle marks the window
  ## shown in the next, more zoomed-in panel.
  ## FIX: the original first assigned t250d.flu via subset(t250d, ...) and
  ## immediately overwrote it from list.dma; the dead subset() call is removed.
  t250d.flu <- list.dma[[which(to.lab$topic250=="flu/disease/epidemic/virus/infection/pandemic/infect")]]
  # Day counter origin: day 1 = 1785-01-01.
  t250d.flu$date <- as.Date(t250d.flu$day,origin="1784-12-31")

  # Long format: one row per (date, moving-average window) for ggplot grouping.
  t250d.flu.long <- melt(t250d.flu,measure.vars=c("DMA30", "DMA90", "DMA180", "DMA365", "DMA730", "DMA1825"))

  # Full 235-year series; rectangle highlights 1970-2020 (next panel).
  flu.235y <- ggplot(t250d.flu.long,aes(x=date,y=value,group=variable,color=variable))+geom_line()+geom_rect(xmin=as.Date("1969-12-31"),xmax=as.Date("2021-01-01"),ymin=0,ymax=0.10,fill=NA,size=2,color="darkgrey")+theme_bluewhite()+xlab("Time")+ylab("News stories per day")

  # 50-year window; rectangle highlights 2000-2020.
  flu.50y <- ggplot(subset(t250d.flu.long,date<as.Date("2021-01-01") & date>as.Date("1969-12-31")),aes(x=date,y=value,group=variable,color=variable))+geom_line()+geom_rect(xmin=as.Date("1999-12-31"),xmax=as.Date("2021-01-01"),ymin=0,ymax=0.10,fill=NA,size=2,color="darkgrey")+theme_bluewhite()+xlab("Time")+ylab("News stories per day")

  # 20-year window; rectangle highlights 2009-2020.
  flu.20y <- ggplot(subset(t250d.flu.long,date<as.Date("2021-01-01") & date>as.Date("1999-12-31")),aes(x=date,y=value,group=variable,color=variable))+geom_line()+geom_rect(xmin=as.Date("2009-01-01"),xmax=as.Date("2021-01-01"),ymin=0,ymax=0.10,fill=NA,size=2,color="darkgrey")+theme_bluewhite()+xlab("Time")+ylab("News stories per day")

  # 10-year window; rectangle highlights 2017-2020.
  flu.10y <- ggplot(subset(t250d.flu.long,date<as.Date("2021-01-01") & date>as.Date("2009-12-31")),aes(x=date,y=value,group=variable,color=variable))+geom_line()+geom_rect(xmin=as.Date("2016-12-31"),xmax=as.Date("2021-01-01"),ymin=0,ymax=0.10,fill=NA,size=2,color="darkgrey")+theme_bluewhite()+xlab("Time")+ylab("News stories per day")

  # 4-year window; rectangle highlights 2020.
  flu.4y <- ggplot(subset(t250d.flu.long,date<as.Date("2021-01-01") & date>as.Date("2016-12-31")),aes(x=date,y=value,group=variable,color=variable))+geom_line()+geom_rect(xmin=as.Date("2019-12-31"),xmax=as.Date("2021-01-01"),ymin=0,ymax=0.10,fill=NA,size=2,color="darkgrey")+theme_bluewhite()+xlab("Time")+ylab("News stories per day")

  # Final year (2020) on its own, no highlight.
  flu.1y <- ggplot(subset(t250d.flu.long,date<as.Date("2021-01-01") & date>as.Date("2019-12-31")),aes(x=date,y=value,group=variable,color=variable))+geom_line()+theme_bluewhite()+xlab("Time")+ylab("News stories per day")

  # Stack the six panels vertically into one figure.
  infection.graphic <- grid.arrange(flu.235y,flu.50y,flu.20y,flu.10y,flu.4y,flu.1y,ncol=1)

  ## Daily coverage of the war topic at six nested windows zooming in on the
  ## 1938 appeasement period.
  ## FIXES: (1) removed the dead subset(t250d, ...) assignment that was
  ## immediately overwritten from list.dma; (2) the first two geom_rect()
  ## highlight boxes had xmin/xmax swapped (xmin > xmax) -- they now match
  ## the date window of the following panel, consistent with every other
  ## panel here and in the flu figure above.
  t250d.war <- list.dma[[which(to.lab$topic250=="foe/enemy/victory/defeat/attack/ally/upon")]]
  # Day counter origin: day 1 = 1785-01-01.
  t250d.war$date <- as.Date(t250d.war$day,origin="1784-12-31")
  t250d.war.long <- melt(t250d.war,measure.vars=c("DMA30", "DMA90", "DMA180", "DMA365", "DMA730", "DMA1825"))

  # Full series; rectangle highlights 1900-1950 (next panel).
  war.235y <- ggplot(t250d.war.long,aes(x=date,y=value,group=variable,color=variable))+geom_line()+geom_rect(xmin=as.Date("1899-12-31"),xmax=as.Date("1950-01-01"),ymin=0,ymax=0.10,fill=NA,size=2,color="darkgrey")+theme_bluewhite()+xlab("Time")+ylab("News stories per day")

  # 1900-1950 window; rectangle highlights 1930-1950.
  war.50y <- ggplot(subset(t250d.war.long,date<as.Date("1950-01-01") & date>as.Date("1899-12-31")),aes(x=date,y=value,group=variable,color=variable))+geom_line()+geom_rect(xmin=as.Date("1929-12-31"),xmax=as.Date("1950-01-01"),ymin=0,ymax=0.10,fill=NA,size=2,color="darkgrey")+theme_bluewhite()+xlab("Time")+ylab("News stories per day")

  # 1930-1950 window; rectangle highlights 1937-1946.
  war.20y <- ggplot(subset(t250d.war.long,date<as.Date("1950-01-01") & date>as.Date("1929-12-31")),aes(x=date,y=value,group=variable,color=variable))+geom_line()+geom_rect(xmin=as.Date("1936-12-31"),xmax=as.Date("1947-01-01"),ymin=0,ymax=0.10,fill=NA,size=2,color="darkgrey")+theme_bluewhite()+xlab("Time")+ylab("News stories per day")

  # 1937-1946 window; rectangle highlights 1938-1940.
  war.10y <- ggplot(subset(t250d.war.long,date<as.Date("1947-01-01") & date>as.Date("1936-12-31")),aes(x=date,y=value,group=variable,color=variable))+geom_line()+geom_rect(xmin=as.Date("1937-12-31"),xmax=as.Date("1941-01-01"),ymin=0,ymax=0.10,fill=NA,size=2,color="darkgrey")+theme_bluewhite()+xlab("Time")+ylab("News stories per day")

  # 1938-1940 window; rectangle highlights 1938.
  war.4y <- ggplot(subset(t250d.war.long,date<as.Date("1941-01-01") & date>as.Date("1937-12-31")),aes(x=date,y=value,group=variable,color=variable))+geom_line()+geom_rect(xmin=as.Date("1938-01-01"),xmax=as.Date("1939-01-01"),ymin=0,ymax=0.10,fill=NA,size=2,color="darkgrey")+theme_bluewhite()+xlab("Time")+ylab("News stories per day")

  # 1938 on its own, no highlight.
  war.1y <- ggplot(subset(t250d.war.long,date<as.Date("1939-01-01") & date>as.Date("1938-01-01")),aes(x=date,y=value,group=variable,color=variable))+geom_line()+theme_bluewhite()+xlab("Time")+ylab("News stories per day")

  # Stack the six panels vertically into one figure.
  appeasement.graphic <- grid.arrange(war.235y,war.50y,war.20y,war.10y,war.4y,war.1y,ncol=1)

  # Export both stacked figures in vector (svg) and raster (png) form,
  # identical dimensions; order preserved: infect.svg, appease.svg,
  # infect.png, appease.png.
  for (fmt in c("svg", "png")) {
    ggsave(infection.graphic, file = paste0("infect.", fmt), dpi = 600, unit = "cm", scale = 1.5, width = 16, height = 20)
    ggsave(appeasement.graphic, file = paste0("appease.", fmt), dpi = 600, unit = "cm", scale = 1.5, width = 16, height = 20)
  }

  ## Run the wave detector on each of the 250 topic time series, then stack
  ## the per-topic results into one data frame.
  ## FIXES: result list preallocated; seq_len() instead of 1:250; the
  ## incremental rbind() accumulation loop (O(n^2) copying) replaced by a
  ## single do.call(rbind, ...), which yields the same stacked frame.
  wavedata250 <- vector("list", 250)

  for (i in seq_len(250)){
      topic_ts <- subset(t250d,topic==to.lab$topic250[i])
      wavedata250[[i]] <- waveanalyzer(x=topic_ts)
      print(i)          # progress indicator
      flush.console()
    }

  wd250 <- do.call(rbind, wavedata250)

  ## Annotate the wave table with calendar year, detection thresholds, and
  ## coarser topic labels, then keep waves above threshold.
  ## NOTE(review): this block operates on `wd`, but the immediately preceding
  ## code builds `wd250` -- confirm `wd` is an object created earlier in the
  ## script, or whether these lines should use `wd250`.
  # Four-digit year of the wave start day (day counter origin 1784-12-31).
  wd$year <- str_extract(as.Date(wd$start,origin="1784-12-31"),pattern="[:digit:]{4,4}")

  # Relative threshold: 0.005% of that year's total article count; absolute
  # floor 0.5; effective threshold is the row-wise maximum of the two.
  wd$r_threshold <- thetimes[match(wd$year,thetimes$year),"articles"]*0.00005
  wd$a_threshold <- 0.5
  wd$v_threshold <- rowMaxs(cbind(wd$r_threshold,wd$a_threshold))

  # Map each 250-model topic onto its 50- and 20-topic parent labels.
  wd$topic50 <- to.lab[match(wd$topic250,to.lab$topic250),"topic50"]
  wd$topic20 <- to.lab[match(wd$topic250,to.lab$topic250),"topic20"]

  # Decade = first three digits of the year times 10 (e.g. 1987 -> 1980).
  wd$decade <- as.numeric(str_extract(wd$year,pattern="^[:digit:]{3,3}"))*10

  # Keep only waves whose volume exceeds the year-specific threshold.
  wdx <- subset(wd,volume>v_threshold)

  # NOTE(review): `wd250x` is not visibly created before this point (only
  # `wdx` above) -- confirm the intended target object of this assignment.
  wd250x$topic50 <- to.lab[match(wd250x$topic250,to.lab$topic250),"topic50"]

  # save(wdx,file="wdx.RData")
  # save(wd,file="wd.RData")

  # Load precomputed objects: the 250-topic STM model and two document-level
  # datasets (object contents not visible in this file section).
  load("stm250.RData")
  load("ox1000.RData")
  load("dx1000.RData")

  # Day counter relative to 1784-12-31 (day 1 = 1785-01-01).
  dx1000$days <- as.numeric(as.Date(dx1000$Time)-as.Date("1784-12-31"))

  # Document-topic proportions side by side with the metadata rows that have
  # a non-missing year. NOTE(review): assumes the rows of stm250$theta align
  # one-to-one with the non-NA-year rows of ox1000$meta -- confirm.
  wide.STM <- data.frame(stm250$theta,ox1000$meta[!is.na(ox1000$meta$year),])

  ## For every above-threshold wave, compare vocabulary in +/-1000-article
  ## windows around the wave against its topic/area baseline.
  ## FIXES: seq_len(W) instead of 1:W (1:W would iterate c(1, 0) if no waves
  ## were detected); result list preallocated to its final length.
  W <- dim(wd250x)[1]
  wave.keywords <- vector("list", W)

  for (w in seq_len(W)){
      wave.keywords[[w]] <- compareVocabulary(data=dx1000,pre=1000,post=1000,start=wd250x$start[w],end=wd250x$end[w],nover=10,nunder=10,topic=wd250x$topic250[w],area=wd250x$topic50[w])
      flush.console()
      print(paste0(w,"/",W))   # progress indicator "w/W"
    }

  ## Pad the per-day 20-topic table so every topic has a row for every day of
  ## the corpus (days 1..86197), inserting zero counts for absent days.
  alldays <- 1:86197
  # TRUE where a day already appears in the observed table (first 44074 rows).
  missdays <- alldays %in% to20_day$day[1:44074]
  misseddays <- alldays[!missdays]

  # One zero-count filler row per (missing day x topic); share left NA so it
  # can be filled below.
  to20_addday <- data.frame(day = rep(misseddays, times = 21),
                            topic = rep(names(table(to.lab$topic20)), each = length(misseddays)),
                            count = 0, total.count = 0, share = NA)

  to20d <- rbind(to20_day, to20_addday)

  # Order by topic, then chronologically within each topic.
  sorter <- order(to20d$topic, to20d$day)
  t20d <- to20d[sorter, ]

  # Row-index checkpoints (used for progress reporting elsewhere).
  count100k <- seq(0, nrow(t20d), 100000)
  count1k <- seq(0, nrow(t20d), 1000)

  # share0/count0: NA treated as zero coverage; i.share: last observation
  # carried forward over missing days.
  share.filled <- t20d$share
  share.filled[is.na(share.filled)] <- 0
  t20d$share0 <- share.filled
  t20d$i.share <- LOCF(t20d$share)

  count.filled <- t20d$count
  count.filled[is.na(count.filled)] <- 0
  t20d$count0 <- count.filled

##### Create moving averages for weeks, 14-days, months, quarters, semi-annual, annual, biannual, quinquennial

  # Topic labels for the 20-topic model (element 5 of the unique labels is
  # dropped, as in the original analysis).
  to.lab.20 <- unique(to.lab$topic20)[-5]

  # Centered rolling means of the zero-filled shares, one column per window
  # width in days. NOTE(review): the loop bound 21 is hard-coded -- confirm
  # length(to.lab.20) == 21.
  ma.windows <- c(7, 14, 30, 90, 180, 365, 730, 1825)
  for (i in 1:21)
  {
    rows <- t20d$topic == to.lab.20[i]
    for (k in ma.windows) {
      t20d[rows, paste0("i.share.", k)] <- rollmean(k = k, x = t20d[rows, "share0"], align = "center", fill = c(NA, NA, NA))
    }
    print(i)
    flush.console()
  }

  # Centered rolling means of the zero-filled daily counts, one column per
  # window width in days. NOTE(review): the loop bound 21 is hard-coded --
  # confirm length(to.lab.20) == 21.
  ma.windows <- c(7, 14, 30, 90, 180, 365, 730, 1825)
  for (i in 1:21)
  {
    rows <- t20d$topic == to.lab.20[i]
    for (k in ma.windows) {
      t20d[rows, paste0("i.count.", k)] <- rollmean(k = k, x = t20d[rows, "count0"], align = "center", fill = c(NA, NA, NA))
    }
    print(i)
    flush.console()
  }

  # Fill the NA head/tail produced by the centered rolling means: within each
  # topic series, the first half is back-filled (next observation carried
  # backward) and the second half forward-filled (last observation carried
  # forward). NOTE(review): the 42999/43000 split, the series length 86197,
  # and the loop bound 21 are hard-coded.
  ma.windows <- c(7, 14, 30, 90, 180, 365, 730, 1825)
  for (i in 1:21)
  {
    rows <- t20d$topic == to.lab.20[i]
    for (k in ma.windows) {
      col <- paste0("i.count.", k)
      t20d[rows, col][1:42999] <- imputeTS::na_locf(t20d[rows, col][1:42999], option = "nocb")
      t20d[rows, col][43000:86197] <- imputeTS::na_locf(t20d[rows, col][43000:86197], option = "locf")
    }
    print(i)
    flush.console()
  }



##### Short-term vs long-term coverage: is the 30-day moving average above the
##### longer-term averages (90d, 180d, 365d, 730d, 1825d)?
##### NOTE(review): the original header said "14day moving average", but the
##### code compares the 30-day average throughout; a30 compares i.count.30
##### with itself and is therefore always 0 (or NA) -- confirm intent.
  t20d$a30 <- 1*(t20d$i.count.30>t20d$i.count.30)
  t20d$a90 <- 1*(t20d$i.count.30>t20d$i.count.90)
  t20d$a180 <- 1*(t20d$i.count.30>t20d$i.count.180)
  t20d$a365 <- 1*(t20d$i.count.30>t20d$i.count.365)
  t20d$a730 <- 1*(t20d$i.count.30>t20d$i.count.730)
  t20d$a1825 <- 1*(t20d$i.count.30>t20d$i.count.1825)
##### Number of long-term averages the short-term average exceeds (range 0-5;
##### a30 is excluded from the sum).
  t20d$a <- t20d$a90+t20d$a180+t20d$a365+t20d$a730+t20d$a1825
##### 1-day lag of a, to detect the day the exceedance count crosses the
##### threshold. FIX: the original dropped the second-to-last element
##### (a[-(length(a)-1)]), mis-aligning the final observation; a proper lag
##### drops the last element. NOTE(review): the lag still runs across topic
##### boundaries (see new.topic below) -- confirm that is acceptable.
  t20d$a.lag1 <- c(NA,t20d$a[-length(t20d$a)])
##### Flag the first day of each topic's series (each series is 86197 days).
  t20d$new.topic <- 0
  t20d$new.topic[seq(1,length(t20d$new.topic),86197)] <- 1
##### A wave switches on when a rises from below 3 to above 2, and off when
##### it falls back; NA transitions (series start) count as no switch-on.
  t20d$turn.on <- 1*(t20d$a.lag1<3 & t20d$a>2)
  t20d$turn.off <- 1*(t20d$a<3 & t20d$a.lag1>2)
  t20d$turn.on <- ifelse(is.na(t20d$turn.on),0,t20d$turn.on)
##### Potential news wave active?
  t20d$pnw <- (t20d$turn.on==1 | t20d$a>2)
##### Running number of the potential news wave; a.pnw.no keeps the number
##### only while the wave condition (a > 2) holds.
  t20d$pnw.no <- cumsum(t20d$turn.on)
  t20d$a.pnw.no <- (t20d$a>2)*t20d$pnw.no
##### Above-average coverage shares: short-term minus long-term share, zeroed
##### when the short-term average is not above. FIX: the original assigned
##### v30 on two consecutive identical lines; the duplicate is removed.
##### NOTE(review): v30 and n30 subtract a column from itself and are always
##### 0 (or NA) -- confirm whether a 14-day short-term series was intended.
  t20d$v30 <- (t20d$a30)*(t20d$i.share.30-t20d$i.share.30)
  t20d$v90 <- (t20d$a90)*(t20d$i.share.30-t20d$i.share.90)
  t20d$v180 <- (t20d$a180)*(t20d$i.share.30-t20d$i.share.180)
  t20d$v365 <- (t20d$a365)*(t20d$i.share.30-t20d$i.share.365)
  t20d$v730 <- (t20d$a730)*(t20d$i.share.30-t20d$i.share.730)
  t20d$v1825 <- (t20d$a1825)*(t20d$i.share.30-t20d$i.share.1825)
##### Above-average coverage counts (same construction on raw counts).
  t20d$n30 <- (t20d$a30)*(t20d$i.count.30-t20d$i.count.30)
  t20d$n90 <- (t20d$a90)*(t20d$i.count.30-t20d$i.count.90)
  t20d$n180 <- (t20d$a180)*(t20d$i.count.30-t20d$i.count.180)
  t20d$n365 <- (t20d$a365)*(t20d$i.count.30-t20d$i.count.365)
  t20d$n730 <- (t20d$a730)*(t20d$i.count.30-t20d$i.count.730)
  t20d$n1825 <- (t20d$a1825)*(t20d$i.count.30-t20d$i.count.1825)

  # Initialize the wave-level share (wv*) and count (wn*) aggregates to zero;
  # they are filled in downstream. Two separate loops preserve the original
  # column creation order (all wv* before all wn*).
  for (w in c(30, 90, 180, 365, 730, 1825)) t20d[[paste0("wv", w)]] <- 0
  for (w in c(30, 90, 180, 365, 730, 1825)) t20d[[paste0("wn", w)]] <- 0

  ## Split the padded day table into one data frame per topic label row.
  ## FIXES: seq_along() instead of 1:length(); list preallocated.
  ## NOTE(review): this iterates over to.lab$topic20 (one entry per row of
  ## to.lab), not over the unique labels in to.lab.20 -- duplicated labels
  ## would produce duplicated list entries; confirm intent.
  t20dl <- vector("list", length(to.lab$topic20))

  for (i in seq_along(to.lab$topic20))
  {
    t20dl[[i]] <- subset(t20d,topic==to.lab$topic20[i])
    print(i)          # progress indicator
    flush.console()
  }

  # Wave numbers are only meaningful while a wave is active; zeros become NA.
  t20d$a.pnw.no <- replace(t20d$a.pnw.no,t20d$a.pnw.no==0,NA)

  # Row-index checkpoints (used for progress reporting elsewhere).
  thousands <- seq(1000,dim(t20d)[1],1000)


  ## Run the wave detector on each of the 21 aggregated topic series, then
  ## stack the per-topic results into one data frame.
  ## FIXES: result list preallocated; seq_len(21) instead of 1:21; the
  ## incremental rbind() accumulation loop (O(n^2) copying) replaced by a
  ## single do.call(rbind, ...), which yields the same stacked frame.
  wavedata20 <- vector("list", 21)

  for (i in seq_len(21))
    {
      topic_ts <- subset(t20d,topic==to.lab.20[i])
      wavedata20[[i]] <- waveanalyzer(x=topic_ts)
      print(i)          # progress indicator
      flush.console()
    }

  wd20 <- do.call(rbind, wavedata20)



##### Create moving averages for weeks, 14-days, months, quarters, semi-annual, annual, biannual, quinquennial

# Topic labels for the 50-topic model (element 6 of the unique labels is
# dropped, as in the original analysis).
to.lab.50 <- unique(to.lab$topic50)[-6]

  # Centered rolling means of the zero-filled shares, one column per window
  # width in days. NOTE(review): the loop bound 161 is hard-coded -- confirm
  # it equals length(to.lab.50).
  ma.windows <- c(7, 14, 30, 90, 180, 365, 730, 1825)
  for (i in 1:161)
    {
      rows <- t50d$topic == to.lab.50[i]
      for (k in ma.windows) {
        t50d[rows, paste0("i.share.", k)] <- rollmean(k = k, x = t50d[rows, "share0"], align = "center", fill = c(NA, NA, NA))
      }
      print(i)
      flush.console()
    }

  # Centered rolling means of the zero-filled daily counts, one column per
  # window width in days. NOTE(review): the loop bound 161 is hard-coded --
  # confirm it equals length(to.lab.50).
  ma.windows <- c(7, 14, 30, 90, 180, 365, 730, 1825)
  for (i in 1:161)
    {
      rows <- t50d$topic == to.lab.50[i]
      for (k in ma.windows) {
        t50d[rows, paste0("i.count.", k)] <- rollmean(k = k, x = t50d[rows, "count0"], align = "center", fill = c(NA, NA, NA))
      }
      print(i)
      flush.console()
    }

  # Fill the NA head/tail produced by the centered rolling means: within each
  # topic series, the first half is back-filled (nocb) and the second half
  # forward-filled (locf). NOTE(review): the 42999/43000 split, the series
  # length 86197, and the loop bound 161 are hard-coded.
  ma.windows <- c(7, 14, 30, 90, 180, 365, 730, 1825)
  for (i in 1:161)
  {
    rows <- t50d$topic == to.lab.50[i]
    for (k in ma.windows) {
      col <- paste0("i.count.", k)
      t50d[rows, col][1:42999] <- imputeTS::na_locf(t50d[rows, col][1:42999], option = "nocb")
      t50d[rows, col][43000:86197] <- imputeTS::na_locf(t50d[rows, col][43000:86197], option = "locf")
    }
    print(i)
    flush.console()
  }



##### Short-term vs long-term coverage: is the 30-day moving average above the
##### longer-term averages (90d, 180d, 365d, 730d, 1825d)?
##### NOTE(review): the original header said "14day moving average", but the
##### code compares the 30-day average throughout; a30 compares i.count.30
##### with itself and is therefore always 0 (or NA) -- confirm intent.
  t50d$a30 <- 1*(t50d$i.count.30>t50d$i.count.30)
  t50d$a90 <- 1*(t50d$i.count.30>t50d$i.count.90)
  t50d$a180 <- 1*(t50d$i.count.30>t50d$i.count.180)
  t50d$a365 <- 1*(t50d$i.count.30>t50d$i.count.365)
  t50d$a730 <- 1*(t50d$i.count.30>t50d$i.count.730)
  t50d$a1825 <- 1*(t50d$i.count.30>t50d$i.count.1825)
##### Number of long-term averages the short-term average exceeds (range 0-5;
##### a30 is excluded from the sum).
  t50d$a <- t50d$a90+t50d$a180+t50d$a365+t50d$a730+t50d$a1825
##### 1-day lag of a. FIX: the original dropped the second-to-last element
##### (a[-(length(a)-1)]), mis-aligning the final observation; a proper lag
##### drops the last element. NOTE(review): the lag still runs across topic
##### boundaries (see new.topic below) -- confirm that is acceptable.
  t50d$a.lag1 <- c(NA,t50d$a[-length(t50d$a)])
##### Flag the first day of each topic's series (each series is 86197 days).
  t50d$new.topic <- 0
  t50d$new.topic[seq(1,length(t50d$new.topic),86197)] <- 1
##### A wave switches on when a rises from 0 to above 0, and off when it
##### falls back. NOTE(review): the t20d version uses a threshold of 3 here,
##### and pnw below still uses a>2 -- confirm the 0/1 thresholds are the
##### intended difference for the 50-topic model.
  t50d$turn.on <- 1*(t50d$a.lag1<1 & t50d$a>0)
  t50d$turn.off <- 1*(t50d$a<1 & t50d$a.lag1>0)
  t50d$turn.on <- ifelse(is.na(t50d$turn.on),0,t50d$turn.on)
##### Potential news wave active?
  t50d$pnw <- (t50d$turn.on==1 | t50d$a>2)
##### Running number of the potential news wave; a.pnw.no keeps the number
##### only while the wave condition (a > 2) holds.
  t50d$pnw.no <- cumsum(t50d$turn.on)
  t50d$a.pnw.no <- (t50d$a>2)*t50d$pnw.no
##### Above-average coverage shares: short-term minus long-term share, zeroed
##### when the short-term average is not above. FIX: the original assigned
##### v30 on two consecutive identical lines; the duplicate is removed.
##### NOTE(review): v30 and n30 subtract a column from itself and are always
##### 0 (or NA) -- confirm whether a 14-day short-term series was intended.
  t50d$v30 <- (t50d$a30)*(t50d$i.share.30-t50d$i.share.30)
  t50d$v90 <- (t50d$a90)*(t50d$i.share.30-t50d$i.share.90)
  t50d$v180 <- (t50d$a180)*(t50d$i.share.30-t50d$i.share.180)
  t50d$v365 <- (t50d$a365)*(t50d$i.share.30-t50d$i.share.365)
  t50d$v730 <- (t50d$a730)*(t50d$i.share.30-t50d$i.share.730)
  t50d$v1825 <- (t50d$a1825)*(t50d$i.share.30-t50d$i.share.1825)
##### Above-average coverage counts (same construction on raw counts).
  t50d$n30 <- (t50d$a30)*(t50d$i.count.30-t50d$i.count.30)
  t50d$n90 <- (t50d$a90)*(t50d$i.count.30-t50d$i.count.90)
  t50d$n180 <- (t50d$a180)*(t50d$i.count.30-t50d$i.count.180)
  t50d$n365 <- (t50d$a365)*(t50d$i.count.30-t50d$i.count.365)
  t50d$n730 <- (t50d$a730)*(t50d$i.count.30-t50d$i.count.730)
  t50d$n1825 <- (t50d$a1825)*(t50d$i.count.30-t50d$i.count.1825)

  # Initialize the wave-level share (wv*) and count (wn*) aggregates to zero;
  # they are filled in downstream. Two separate loops preserve the original
  # column creation order (all wv* before all wn*).
  for (w in c(30, 90, 180, 365, 730, 1825)) t50d[[paste0("wv", w)]] <- 0
  for (w in c(30, 90, 180, 365, 730, 1825)) t50d[[paste0("wn", w)]] <- 0

  # Wave numbers are only meaningful while a wave is active; zeros become NA.
  t50d$a.pnw.no <- replace(t50d$a.pnw.no, t50d$a.pnw.no == 0, NA)

  # Row-index checkpoints (used for progress reporting elsewhere).
  thousands <- seq(1000, nrow(t50d), 1000)



## Run the wave detector on each 50-model topic series, then stack the
## per-topic results into one data frame.
## FIXES: result list preallocated; seq_len(161) instead of 1:161; the
## incremental rbind() accumulation loop (O(n^2) copying) replaced by a
## single do.call(rbind, ...), which yields the same stacked frame.
## NOTE(review): the bound 161 is hard-coded -- confirm it equals
## length(to.lab.50).
wavedata50 <- vector("list", 161)

  for (i in seq_len(161))
    {
      topic_ts <- subset(t50d,topic==to.lab.50[i])
      wavedata50[[i]] <- waveanalyzer(x=topic_ts)
      print(i)          # progress indicator
      flush.console()
    }

wd50 <- do.call(rbind, wavedata50)


  # First-pass wave filter: keep waves with volume > 2.5.
  # NOTE(review): wd50x is recomputed from wd50 a few lines below
  # (volume > thresh2/5), discarding this subset and its start.date column —
  # confirm whether these two lines are exploratory leftovers.
  wd50x <- subset(wd50,volume>2.5)

  wd50x$start.date <- as.Date(wd50x$start,origin="1784-12-31")

  # Year-specific volume thresholds scaled to the paper's annual article count.
  newsvolume <- data.frame(year=thetimes$year,articles=thetimes$articles)
  newsvolume$threshold <- newsvolume$articles*0.0004
  newsvolume$threshold2 <- ifelse(newsvolume$threshold>1,newsvolume$threshold,1)
  # Calendar year of each wave start (start = days since 1784-12-31).
  # NOTE(review): format="%Y" is not an argument of the POSIXct method of
  # as.Date and appears to be ignored; the year is actually obtained by
  # substring(..., 1, 4) of the printed date. Confirm.
  wd50$year <- substring(as.Date(as.POSIXct(wd50$start*60*60*24,origin="1784-12-31"),format="%Y"),1,4)

  # Attach each wave's year-specific threshold; enforce a floor of 5.
  wd50$thresh <- newsvolume[match(wd50$year,newsvolume$year),"threshold"]
  wd50$thresh2 <- ifelse(wd50$thresh>5,wd50$thresh,5)

  # Final filter: wave volume must exceed a fifth of the year's threshold.
  wd50x <- subset(wd50,volume>thresh2/5)

  # Diagnostic plots: total volume, duration, intensity, and wave counts per year.
  ggplot(wd50x,aes(x=as.numeric(year),y=volume+baseline30*duration))+geom_point()+theme_bluewhite()+scale_x_continuous(breaks=seq(1780,2020,20))+geom_smooth()
  ggplot(wd50x,aes(x=as.numeric(year),y=duration))+geom_point()+theme_bluewhite()+scale_x_continuous(breaks=seq(1780,2020,20))+geom_smooth()
  ggplot(wd50x,aes(x=as.numeric(year),y=intensity))+geom_point()+theme_bluewhite()+scale_x_continuous(breaks=seq(1780,2020,20))+geom_smooth()

  ggplot(data.frame(table(wd50x$year)),aes(x=as.numeric(Var1),y=Freq,label=Var1))+geom_text()+theme_bluewhite()+geom_smooth()


##### Create moving averages for weeks, 14-days, months, quarters, semi-annual, annual, biannual, quinquennial

  # Full 1..86197 day grid; determine which days are absent from the observed
  # topic-day table (only the first 44074 rows are considered, as in the
  # original — NOTE(review): confirm this hard-coded cutoff).
  alldays <- 1:86197
  observed <- alldays %in% to250_day$day[1:44074]
  misseddays <- alldays[!observed]

  # Zero-count filler rows: one per missing day for each of the 250 topics.
  to250_addday <- data.frame(day=rep(misseddays,times=250),topic=rep(names(table(to.lab$topic250)),each=length(misseddays)),count=0,total.count=0,share=NA)

  # Append the fillers and re-sort so each topic forms one contiguous,
  # day-ordered block.
  to250d <- rbind(to250_day,to250_addday)

  t250d <- to250d[order(to250d$topic, to250d$day), ]

  # Row markers used for progress printing.
  count100k <- seq(0, nrow(t250d), 100000)
  count1k <- seq(0, nrow(t250d), 1000)

  # Zero-filled and last-observation-carried-forward versions of the share.
  t250d$share0 <- replace(t250d$share, is.na(t250d$share), 0)
  t250d$i.share <- LOCF(t250d$share)

  t250d$count0 <- replace(t250d$count, is.na(t250d$count), 0)

  # Topic labels of the 250-topic model, in order of first appearance.
  to.lab.250 <- unique(to.lab$topic250)

  # Centered rolling means of the zero-filled share, one column per window
  # length, computed separately within each topic block.
  for (i in 1:250)
    {
      sel <- t250d$topic == to.lab.250[i]
      for (k in c(7, 14, 30, 90, 180, 365, 730, 1825))
        {
          t250d[sel, paste0("i.share.", k)] <- rollmean(k=k,x=t250d[sel,"share0"],align="center",fill=c(NA,NA,NA))
        }
      print(i)        # progress indicator
      flush.console()
    }

  # Centered rolling means of the zero-filled count, one column per window
  # length, computed separately within each topic block.
  for (i in 1:250)
    {
      sel <- t250d$topic == to.lab.250[i]
      for (k in c(7, 14, 30, 90, 180, 365, 730, 1825))
        {
          t250d[sel, paste0("i.count.", k)] <- rollmean(k=k,x=t250d[sel,"count0"],align="center",fill=c(NA,NA,NA))
        }
      print(i)        # progress indicator
      flush.console()
    }

  # Fill the NA edges left by the centered rolling means: within each topic
  # block, the first half (rows 1..42999) is back-filled (next observation
  # carried backward) and the second half (rows 43000..86197) is
  # forward-filled. Assumes each topic block spans exactly 86197 rows, as the
  # original did.
  for (i in 1:250)
    {
      sel <- t250d$topic == to.lab.250[i]
      for (k in c(7, 14, 30, 90, 180, 365, 730, 1825))
        {
          col <- paste0("i.count.", k)
          v <- t250d[sel, col]
          v[1:42999] <- imputeTS::na_locf(v[1:42999], option="nocb")
          v[43000:86197] <- imputeTS::na_locf(v[43000:86197], option="locf")
          t250d[sel, col] <- v
        }
      print(i)        # progress indicator
      flush.console()
    }



##### Is the 30-day moving average above the longer-term averages 90d, 180d, 365d, 730d, 1825d?
  # (The original comment said "14day", but the code compares i.count.30.)
  # NOTE(review): a30 compares i.count.30 with itself and is therefore always 0.
  t250d$a30 <- 1*(t250d$i.count.30>t250d$i.count.30)
  t250d$a90 <- 1*(t250d$i.count.30>t250d$i.count.90)
  t250d$a180 <- 1*(t250d$i.count.30>t250d$i.count.180)
  t250d$a365 <- 1*(t250d$i.count.30>t250d$i.count.365)
  t250d$a730 <- 1*(t250d$i.count.30>t250d$i.count.730)
  t250d$a1825 <- 1*(t250d$i.count.30>t250d$i.count.1825)
##### Sum of the long-term averages the 30-day moving average exceeds (0-5; a30 excluded)
  t250d$a <- t250d$a90+t250d$a180+t250d$a365+t250d$a730+t250d$a1825
##### 1-day time lagged version of a, to see when the number of exceedings increases from 0 to 1 or higher.
  # BUGFIX: the original used a[-(length(a)-1)], dropping the SECOND-TO-LAST
  # element, so the final row received its own (unlagged) value. Dropping the
  # last element yields a correct lag-1 series of the same length.
  t250d$a.lag1 <- c(NA,t250d$a[-length(t250d$a)])
##### Check if the number of long-term averages that are exceeded turns greater than 0, or moves from (greater than 0) to 0.
  t250d$new.topic <- 0
  # Mark the first row of each 86197-day topic block.
  t250d$new.topic[seq(1,length(t250d$new.topic),86197)] <- 1
  t250d$turn.on <- 1*(t250d$a.lag1<1 & t250d$a>0)    # exceedance count rises 0 -> >0
  t250d$turn.off <- 1*(t250d$a<1 & t250d$a.lag1>0)   # exceedance count falls >0 -> 0
  t250d$turn.on <- ifelse(is.na(t250d$turn.on),0,t250d$turn.on)  # first row (NA lag) is never a turn-on
##### Potential news wave active?
  t250d$pnw <- (t250d$turn.on==1 | t250d$a>2)
##### Running number of potential news wave
  t250d$pnw.no <- cumsum(t250d$turn.on)
  t250d$a.pnw.no <- (t250d$a>2)*t250d$pnw.no
##### Above-average coverage shares
  # NOTE(review): a30 is always 0 (see above), so v30 and n30 are constant 0 —
  # likely a copy-paste placeholder; confirm intent.
  # BUGFIX: the original assigned v30 twice with the identical expression; the
  # redundant duplicate line has been removed (no behavioral change).
  t250d$v30 <- (t250d$a30)*(t250d$i.share.30-t250d$i.share.30) 
  t250d$v90 <- (t250d$a90)*(t250d$i.share.30-t250d$i.share.90) 
  t250d$v180 <- (t250d$a180)*(t250d$i.share.30-t250d$i.share.180) 
  t250d$v365 <- (t250d$a365)*(t250d$i.share.30-t250d$i.share.365) 
  t250d$v730 <- (t250d$a730)*(t250d$i.share.30-t250d$i.share.730) 
  t250d$v1825 <- (t250d$a1825)*(t250d$i.share.30-t250d$i.share.1825) 
##### Above-average coverage count
  t250d$n30 <- (t250d$a30)*(t250d$i.count.30-t250d$i.count.30) 
  t250d$n90 <- (t250d$a90)*(t250d$i.count.30-t250d$i.count.90) 
  t250d$n180 <- (t250d$a180)*(t250d$i.count.30-t250d$i.count.180) 
  t250d$n365 <- (t250d$a365)*(t250d$i.count.30-t250d$i.count.365) 
  t250d$n730 <- (t250d$a730)*(t250d$i.count.30-t250d$i.count.730) 
  t250d$n1825 <- (t250d$a1825)*(t250d$i.count.30-t250d$i.count.1825) 

  # Initialize the cumulative wave accumulators; they are filled in later loops.
  t250d$wv30 <- 0
  t250d$wv90 <- 0
  t250d$wv180 <- 0
  t250d$wv365 <- 0
  t250d$wv730 <- 0
  t250d$wv1825 <- 0
  t250d$wn30 <- 0
  t250d$wn90 <- 0
  t250d$wn180 <- 0
  t250d$wn365 <- 0
  t250d$wn730 <- 0
  t250d$wn1825 <- 0

  # A running wave number of 0 means "no active wave"; recode to NA.
  t250d$a.pnw.no <- replace(t250d$a.pnw.no,t250d$a.pnw.no==0,NA)

  thousands <- seq(1000,dim(t250d)[1],1000)

  # Run the wave analysis once per topic of the 250-topic model; one list
  # element per topic.
  wavedata250 <- list()

  for (i in 1:250)
    {
      topic_ts <- subset(t250d, topic == to.lab.250[i])
      wavedata250[[i]] <- waveanalyzer(x = topic_ts)
      print(i)        # progress indicator
      flush.console()
    }

  # Stack the per-topic wave tables into one data frame.
  # FIX: the original grew wd250 with rbind() inside a loop (quadratic
  # copying); a single do.call(rbind, ...) produces the same rows in the same
  # order in one pass.
  wd250 <- do.call(rbind, wavedata250)

  # Baseline volume: wave duration times the mean of the five long-run baselines.
  wd250$baseline.volume <- wd250$duration*rowMeans(cbind(wd250[,c("baseline90", "baseline180", "baseline365", "baseline730", "baseline1825")]))

  # First-pass wave filter.
  # NOTE(review): wd250x is recomputed from wd250 below
  # ((volume+baseline.volume) > thresh2/2), discarding this subset and its
  # start.date column — confirm whether these two lines are exploratory
  # leftovers.
  wd250x <- subset(wd250,volume>2.5)

  wd250x$start.date <- as.Date(wd250x$start,origin="1784-12-31")

  # Year-specific volume thresholds from annual article counts.
  newsvolume <- data.frame(year=thetimes$year,articles=thetimes$articles)
  newsvolume$threshold <- newsvolume$articles*0.0004
  newsvolume$threshold2 <- ifelse(newsvolume$threshold>1,newsvolume$threshold,1)
  # NOTE(review): format="%Y" is not an argument of the POSIXct method of
  # as.Date and appears to be ignored; the year comes from substring(...,1,4).
  wd250$year <- substring(as.Date(as.POSIXct(wd250$start*60*60*24,origin="1784-12-31"),format="%Y"),1,4)

  # Attach each wave's year-specific threshold; enforce a floor of 5.
  wd250$thresh <- newsvolume[match(wd250$year,newsvolume$year),"threshold"]
  wd250$thresh2 <- ifelse(wd250$thresh>5,wd250$thresh,5)

  # Final filter: total (wave + baseline) volume must exceed half the threshold.
  wd250x <- subset(wd250,(volume+baseline.volume)>thresh2/2)

  # Diagnostic plots: total volume, duration, intensity, and wave counts per year.
  ggplot(wd250x,aes(x=as.numeric(year),y=volume+baseline30*duration))+geom_point()+theme_bluewhite()+scale_x_continuous(breaks=seq(1780,2020,20))+geom_smooth()
  ggplot(wd250x,aes(x=as.numeric(year),y=duration))+geom_point()+theme_bluewhite()+scale_x_continuous(breaks=seq(1780,2020,20))+geom_smooth()
  ggplot(wd250x,aes(x=as.numeric(year),y=(intensity+baseline30)))+geom_point()+theme_bluewhite()+scale_x_continuous(breaks=seq(1780,2020,20))+geom_smooth()+ylim(0,1)

  ggplot(data.frame(table(wd250x$year)),aes(x=as.numeric(Var1),y=Freq,label=Var1))+geom_text()+theme_bluewhite()+geom_smooth()

  # save(t250d,file="t250x.RData")
  # save(t50d,file="t50x.RData")
  # save(t20d,file="t20x.RData")



  # save(wd250x,file="wd250x.RData")
  # save(wd50x,file="wd50x.RData")
  # save(wd20x,file="wd20x.RData")


  # Cumulative above-average share (wv*) per topic, computed on the
  # list-of-data-frames representation t250dl.
  # NOTE(review): t250dl is not built in this chunk — presumably t250d split
  # by topic with 86197 rows each; confirm against its construction.
  # The running sum resets when a topic starts (new.topic) or a wave switches
  # off (turn.off). Only the wv* columns are updated here; the wn*
  # counterparts are not — confirm whether a parallel loop exists elsewhere.
  for (j in 1:250)
  {
      for (i in 2:86197)
      {
        # cd = current day's row, pd = previous day's row.
        cd <- t250dl[[j]][i,]
        pd <- t250dl[[j]][(i-1),]
        # Accumulate yesterday's running total plus today's excess; zero the
        # total on a topic's first row or when the wave turns off.
        cd[,c("wv30", "wv90", "wv180", "wv365", "wv730", "wv1825")] <- c((1-cd$new.topic)*(pd$wv30+cd$v30)*(1-cd$turn.off),
            (1-cd$new.topic)*(pd$wv90+cd$v90)*(1-cd$turn.off),
            (1-cd$new.topic)*(pd$wv180+cd$v180)*(1-cd$turn.off),
            (1-cd$new.topic)*(pd$wv365+cd$v365)*(1-cd$turn.off),
            (1-cd$new.topic)*(pd$wv730+cd$v730)*(1-cd$turn.off),
            (1-cd$new.topic)*(pd$wv1825+cd$v1825)*(1-cd$turn.off))
        # Write the updated row back into the list element.
        t250dl[[j]][i,] <- cd
      }
    print(j)          # progress indicator (one tick per topic)
    flush.console()
  }


  # Cumulative above-average coverage (wv*/wn*): running sums of the daily
  # excesses v*/n*, reset at each topic start (new.topic) and whenever a wave
  # switches off (turn.off).
  # BUGFIX: the original wrapped this loop in `for (i in 2:dim(t250d)[1])`,
  # which reused the SAME index variable `i` as the inner loop. The outer loop
  # therefore only re-ran the identical, idempotent inner pass
  # nrow(t250d)-1 times. The redundant outer loop has been removed; the final
  # contents of t250d are unchanged.
  # NOTE(review): only rows 2:10000 are processed (likely a debugging
  # remnant) — confirm whether 2:nrow(t250d) was intended here.
  for (i in 2:10000)
  {
    t250d[i,"wv30"] <- (1-(t250d[i,"new.topic"]==1))*(t250d[i,"v30"]+t250d[i-1,"wv30"])*(1-t250d$turn.off[i])
    t250d[i,"wv90"] <- (1-(t250d[i,"new.topic"]==1))*(t250d[i,"v90"]+t250d[i-1,"wv90"])*(1-t250d$turn.off[i])
    t250d[i,"wv180"] <- (1-(t250d[i,"new.topic"]==1))*(t250d[i,"v180"]+t250d[i-1,"wv180"])*(1-t250d$turn.off[i])
    t250d[i,"wv365"] <- (1-(t250d[i,"new.topic"]==1))*(t250d[i,"v365"]+t250d[i-1,"wv365"])*(1-t250d$turn.off[i])
    t250d[i,"wv730"] <- (1-(t250d[i,"new.topic"]==1))*(t250d[i,"v730"]+t250d[i-1,"wv730"])*(1-t250d$turn.off[i])
    t250d[i,"wv1825"] <- (1-(t250d[i,"new.topic"]==1))*(t250d[i,"v1825"]+t250d[i-1,"wv1825"])*(1-t250d$turn.off[i])
    t250d[i,"wn30"] <- (1-(t250d[i,"new.topic"]==1))*(t250d[i,"n30"]+t250d[i-1,"wn30"])*(1-t250d$turn.off[i])
    t250d[i,"wn90"] <- (1-(t250d[i,"new.topic"]==1))*(t250d[i,"n90"]+t250d[i-1,"wn90"])*(1-t250d$turn.off[i])
    t250d[i,"wn180"] <- (1-(t250d[i,"new.topic"]==1))*(t250d[i,"n180"]+t250d[i-1,"wn180"])*(1-t250d$turn.off[i])
    t250d[i,"wn365"] <- (1-(t250d[i,"new.topic"]==1))*(t250d[i,"n365"]+t250d[i-1,"wn365"])*(1-t250d$turn.off[i])
    t250d[i,"wn730"] <- (1-(t250d[i,"new.topic"]==1))*(t250d[i,"n730"]+t250d[i-1,"wn730"])*(1-t250d$turn.off[i])
    t250d[i,"wn1825"] <- (1-(t250d[i,"new.topic"]==1))*(t250d[i,"n1825"]+t250d[i-1,"wn1825"])*(1-t250d$turn.off[i])
  }

  # One ID per potential news wave (currently unused; see the stub loop below).
  pnw.counter <- (1:max(t250d$pnw.no,na.rm=TRUE))

# for (i in 1:length(pnw.counter)){
# cd <- 
# }

  # Visual checks for the first topic: daily shares against the 30-day and
  # 365-day / 1825-day moving averages.
  ggplot(subset(t250d,topic==to.lab$topic250[1]),aes(x=1785+(day/365.25)))+geom_point(aes(y=i.share),size=0.2)+
  geom_line(aes(y=i.share.30),color="brown")+geom_line(aes(y=i.share.365),color="green")

  ggplot(subset(t250d,topic==to.lab$topic250[1]),aes(x=1785+(day/365.25)))+
  geom_line(aes(y=i.share.30),color="brown")+geom_line(aes(y=i.share.1825),color="green")

  # NOTE(review): this overwrites the centered i.share.7 computed earlier with
  # a right-aligned version, and rollmean() is called without fill=, so the
  # result is presumably shorter than the column it replaces and the
  # assignment would fail — confirm whether fill=NA was intended (compare the
  # earlier rollmean calls, which pass fill=).
  t250d$i.share.7  <- rollmean(t250d$i.share,k=7,align="right")

  # Each topic's first day gets share 0 so the carry-forward loop below always
  # has a starting value.
  t250d[t250d$day==1,"share"] <- 0

  # Carry the last observed share forward, row by row.
  # NOTE(review): only the first 5000 rows are processed (debug remnant?). The
  # loop starts at i=1, where t250d[i-1,] would be empty if share[1] were NA;
  # it is not, because the day==1 rows were just zeroed.
  for (i in 1:5000)
  {
    if(is.na(t250d$share[i])){
    t250d[i,"share"] <- t250d[i-1,"share"]}
    if(i%in%count1k){flush.console(); print(i)}
  }

  # Template frame covering every day once, with zero counts and a dummy topic.
  daybyday <- data.frame(day=1:85831,count=0,total.count=0,share=0,topic="none")

  # NOTE(review): `i` here is whatever value the preceding loop left behind —
  # presumably scratch code for a single topic; confirm the intended index.
  to <- subset(to250_day,topic==to.lab[i,"topic250"])

  # BUGFIX: the original indexed `to[match(...)]` with a single (i.e. column)
  # index, which selects COLUMNS by the matched row numbers instead of rows.
  # Row indexing plus the three target columns is what this assignment needs.
  daybyday[,c("count", "total.count", "share")] <- to[match(daybyday$day,to$day),c("count", "total.count", "share")]

  long.STM$Month <- NA

  # Map English month names to month numbers.
  long.STM$Month <- Recode(long.STM$month,"'January'=1;'February'=2;'March'=3;'April'=4;
    'May'=5;'June'=6;'July'=7;'August'=8;
    'September'=9;'October'=10;'November'=11;'December'=12")

  # Day of month: the first one- or two-digit run in the raw date string.
  day_of_month <- as.numeric(str_extract(long.STM$date,pattern="[:digit:]{1,2}"))
  long.STM$Day <- day_of_month
  # Assemble YYYY-M-D strings; store them only for post-2014 articles.
  assembled <- paste0(long.STM$year,"-",long.STM$Month,"-",long.STM$Day)
  post2014 <- (long.STM$year>2014)
  long.STM[post2014,"Time"] <- assembled[post2014]

  dtm$Month <- NA

  # Map English month names to month numbers.
  dtm$Month <- Recode(dtm$month,"'January'=1;'February'=2;'March'=3;'April'=4;
    'May'=5;'June'=6;'July'=7;'August'=8;
    'September'=9;'October'=10;'November'=11;'December'=12")

  # Four-digit year from the raw date string.
  dtm$Year <- as.numeric(str_extract(dtm$date,pattern="[:digit:]{4}"))

  # Day of month: the first one- or two-digit run in the raw date string.
  day_of_month <- as.numeric(str_extract(dtm$date,pattern="[:digit:]{1,2}"))
  dtm$Day <- day_of_month
  # Assemble YYYY-M-D strings; store them only for post-2014 rows.
  assembled <- paste0(dtm$Year,"-",dtm$Month,"-",dtm$Day)
  post2014 <- (dtm$Year>2014)
  dtm$Time[post2014] <- assembled[post2014]


  wide.STM$Month <- NA

  # Map English month names to month numbers.
  wide.STM$Month <- Recode(wide.STM$month,"'January'=1;'February'=2;'March'=3;'April'=4;
    'May'=5;'June'=6;'July'=7;'August'=8;
    'September'=9;'October'=10;'November'=11;'December'=12")

  # Four-digit year from the raw date string.
  wide.STM$Year <- as.numeric(str_extract(wide.STM$date,pattern="[:digit:]{4}"))

  # Day of month: the first one- or two-digit run in the raw date string.
  day_of_month <- as.numeric(str_extract(wide.STM$date,pattern="[:digit:]{1,2}"))
  wide.STM$Day <- day_of_month
  # Assemble YYYY-M-D strings; store them only for post-2014 rows.
  assembled <- paste0(wide.STM$Year,"-",wide.STM$Month,"-",wide.STM$Day)
  post2014 <- (wide.STM$Year>2014)
  wide.STM$Time[post2014] <- assembled[post2014]
        

  long.STM$Date <- as.Date(long.STM$Time,format="%Y-%m-%d") # Parse the assembled YYYY-M-D strings into Date objects (NA where Time was never set).

  long.STM$days <- as.numeric(as.POSIXct(long.STM$Date))/60/60/24+719528    # Continuous day count since 1 Jan 0 (= 1 Jan 1 BC) — the author's "Jesus date". POSIXct seconds are converted to days and offset by 719528 so that day 0 is 1 Jan 0 rather than the Unix epoch. Gives a continuous measure of time in days.

  wide.STM$days <- as.numeric(as.POSIXct(wide.STM$Time,format="%Y-%m-%d"))/60/60/24+719528  # Same day count for the "wide" topic representation.

  dtm$days <- as.numeric(as.POSIXct(dtm$Time,format="%Y-%m-%d"))/60/60/24+719528 # Same day count for the document-level data.

  # schaltjahre <- seq(1788,2020,4)                                         # Lines 20-40: An alternative way of converting to "days since 1 Jan 0". Leads to deviations of a few days, probably imperfect understanding of Gregorian calendar? So I rely on as.POXICct, rather.
  # schaltjahre <- schaltjahre[c(-4,-29,-54)]

  # nichtschalt <- c(1,-1,0,0,1,1,2,3,3,4,4,5)
  # schalt <- c(1,0,1,1,2,2,3,4,4,5,5,6)

  # minday <- ifelse(1786%in%schaltjahre,
  #         1786*365+1*30+1+floor(1786/4)-floor(1786/100),
  #         1786*365+1*30+1+floor(1786/4)-floor(1786/100))

  #long.STM$days <- ifelse(as.numeric(long.STM$year)%in%schaltjahre,
  #             as.numeric(long.STM$year)*365+(long.STM$Month-1)*30+as.numeric(long.STM$day)+
  #             schalt[long.STM$Month]+
  #             floor(as.numeric(long.STM$year)/4)*1-                       # Julianische Modifikation
  #             floor(as.numeric(long.STM$year)/100)*1,                 # Gregorianische Modifikation
  #             as.numeric(long.STM$year)*365+(long.STM$Month-1)*30+as.numeric(long.STM$day)+
  #             nichtschalt[long.STM$Month]+
  #             floor(as.numeric(long.STM$year)/4)*1-                       # Julianische Modifikation
  #             floor(as.numeric(long.STM$year)/100)*1)                 # Gregorianische Modifikation

  # as.numeric(as.POSIXct(long.STM$days[1:100]*24*60*60,origin="0000-01-01",tz="UTC")))

  days_to_date <- cbind(names(table(long.STM$days)),names(table(long.STM$Date))) # Lookup table pairing day numbers with calendar dates. NOTE(review): relies on table() sorting both vectors in the same order — confirm days and Date sort consistently.

  issues <- levels(long.STM$issue)[1:120] # Factor levels of `issue`, taken as the names of the 120 topics.

  tss <- list() # Container for the per-topic time series (event counters and the filtering criteria) produced by the loop below.

  ### NOTE(review): this comment was truncated in the original ("On a moderately fast personal computer, the ...").

  for (i in 106:120) # Run the procedure for topics 106-120, one by one. NOTE(review): earlier index ranges were presumably processed in prior runs; confirm.
  {
  # Create a fully continuous daily time series of "article counts" (summed topic probabilities).
    articles <- subset(long.STM,issue==issues[i]) # reduces the data to the topic that is currently analyzed.
    daybyday <- aggregate(articles$pr,by=list(Time=articles$days),FUN="sum") # articles aggregated on a daily basis.
    startday <- min(daybyday$Time,na.rm=TRUE)               # first observed day of the study period ("Jesus date").
    finalday <- max(daybyday$Time,na.rm=TRUE)               # last observed day of the study period.
    dbd <- startday:finalday                            # every day in the period, so the series includes "empty" days and is fully continuous.
    timeseries <- matrix(ncol=2,nrow=length(dbd),0)     # continuous (day, count) matrix, zero-filled for empty days.
    timeseries[,1] <- dbd                               # day number for every cell.
    match.days <- match(daybyday[,"Time"],dbd)          # positions of the observed days within the continuous grid.
    timeseries[match.days,2] <- daybyday[,"x"]          # copy the observed daily sums into the grid.
    ts <- data.frame(timeseries)                        # data.frame for ease of handling. NOTE: shadows stats::ts in this script.
    names(ts) <- c("day", "n")
  # Right-aligned rolling means. n7 (and n30) measure current coverage intensity; the longer windows serve as baselines of "expected" coverage absent an event.
    ts$n7 <- rollmean(x=ts$n,k=7,align="right",fill=0)          # 7-day moving average (week)
    ts$n30 <- rollmean(x=ts$n,k=30,align="right",fill=0)        # 30-day moving average (month)
    ts$n90 <- rollmean(x=ts$n,k=90,align="right",fill=0)        # 90-day moving average (quarter)
    ts$n180 <- rollmean(x=ts$n,k=180,align="right",fill=0)      # 180-day moving average (half-year)
    ts$n365 <- rollmean(x=ts$n,k=365,align="right",fill=0)      # 365-day moving average (year)
    ts$n1461 <- rollmean(x=ts$n,k=1461,align="right",fill=0)    # 1461-day moving average (olympiad, 4-year)
    ts$n3652 <- rollmean(x=ts$n,k=3652,align="right",fill=0)    # 3652-day moving average (decade, 10-year)
    T <- dim(ts)[1]                                         # number of days to scan. NOTE(review): masks the TRUE shorthand T for the rest of the script.
    ts$o7 <- c((ts$n7>ts$n30) + (ts$n7>ts$n90) + (ts$n7>ts$n180) + (ts$n7>ts$n365) + (ts$n7>ts$n1461) + (ts$n7>ts$n3652))    # number of baselines the 7-day mean exceeds; all SIX means a potential event.
    ts$o7r <- c(ts$o7[2:length(ts$o7)],0)                   # o7 of the following day
    ts$o7f <- c(NA,ts$o7[1:(length(ts$o7)-1)])              # o7 of the preceding day
    ts$o7c <- (ts$o7==6 & ts$o7==ts$o7r & ts$o7==ts$o7f)    # all 6 criteria met today AND on both neighboring days
    ts$o7dur <- 0
    ts$o7vol <- 0
    ts$o7int <- 0
    ts$o7sta <- NA
    ts$o7end <- NA
    ts$o7d30 <- 0
    ts$o7d90 <- 0
    ts$o7d180 <- 0
    ts$o7d365 <- 0
    ts$o7d1461 <- 0
    ts$o7d3652 <- 0
    ts$o7id <- 0
    ts$o30 <- c((ts$n30>ts$n90) + (ts$n30>ts$n180) + (ts$n30>ts$n180) + (ts$n30>ts$n365) + (ts$n30>ts$n1461) + (ts$n30>ts$n3652))    # n180 is deliberately counted twice to keep six criteria (per the original author); this does not change the outcome.
    ts$o30r <- c(ts$o30[2:length(ts$o30)],0)
    ts$o30f <- c(NA,ts$o30[1:(length(ts$o30)-1)])
    ts$o30c <- (ts$o30==6 & ts$o30==ts$o30r & ts$o30==ts$o30f)
    ts$o30dur <- 0
    ts$o30vol <- 0
    ts$o30int <- 0
    ts$o30sta <- NA
    ts$o30end <- NA
    ts$o30d90 <- 0
    ts$o30d180 <- 0
    ts$o30d365 <- 0
    ts$o30d1461 <- 0
    ts$o30d3652 <- 0
    ts$o30id <- 0
      for (t in 2:T)
      {
      ts$o7dur[t] <- (ts$o7c[t])*(1+ts$o7dur[t-1])                          # days in a row that attention has been above all averages; resets to 0 as soon as it drops below one of them.
      ts$o7id[t]       <- if(ts$o7dur[t]==1) ts$o7id[t-1]+1 else ts$o7id[t-1]   # running ID number of the current news wave.
      if(ts$o7dur[t]>0) {                                                       # if the 7-day average is above all baselines (potential "news wave")...
        ts$o7sta[t] <- if(ts$o7dur[t]==1) ts$day[t] else ts$o7sta[t-1]          # record the start day of the wave.
        # BUGFIX: guard the t+1 lookup — at t==T the original indexed one past the
        # end, producing if(NA) and a runtime error; the series end is treated as
        # the wave end. NOTE(review): for t<T, o7dur[t+1] still holds its
        # initialized 0 when read here, so the condition is effectively always
        # TRUE; the true wave end remains recoverable as the last in-wave day.
        ts$o7end[t] <- if(t==T || ts$o7dur[t+1]==0) ts$day[t] else NA
        ts$o7d30[t] <-  ts$o7d30[t-1]+(ts$n7[t]-ts$n30[t])                      # cumulative excess over the 30-day average.
        ts$o7d90[t] <- ts$o7d90[t-1]+(ts$n7[t]-ts$n90[t])                       # cumulative excess over the 90-day average.
        ts$o7d180[t] <- ts$o7d180[t-1]+(ts$n7[t]-ts$n180[t])                    # cumulative excess over the 180-day average.
        ts$o7d365[t] <- ts$o7d365[t-1]+(ts$n7[t]-ts$n365[t])                    # cumulative excess over the 365-day average.
        ts$o7d1461[t] <- ts$o7d1461[t-1]+(ts$n7[t]-ts$n1461[t])                 # cumulative excess over the 1461-day average.
        ts$o7d3652[t] <- ts$o7d3652[t-1]+(ts$n7[t]-ts$n3652[t])                 # cumulative excess over the 3652-day average.
        }
      ts$o30dur[t] <- (ts$o30c[t])*(1+ts$o30dur[t-1])                           # same counter for the 30-day variant.
      ts$o30id[t]       <- if(ts$o30dur[t]==1) ts$o30id[t-1]+1 else ts$o30id[t-1]   # running ID number of the current news wave.
      if(ts$o30dur[t]>0) {                                                  # if the 30-day average is above all baselines...
        ts$o30sta[t] <- if(ts$o30dur[t]==1) ts$day[t] else ts$o30sta[t-1]       # record the start day of the wave.
        ts$o30end[t] <- if(t==T || ts$o30dur[t+1]==0) ts$day[t] else NA         # BUGFIX: same t+1 guard as for o7end above.
        # BUGFIX: the following five accumulators previously read the o7d*
        # columns (copy-paste error), mixing the 7-day wave totals into the
        # 30-day ones; each now accumulates its own o30d* column.
        ts$o30d90[t] <- ts$o30d90[t-1]+(ts$n30[t]-ts$n90[t])                    # cumulative excess over the 90-day average.
        ts$o30d180[t] <- ts$o30d180[t-1]+(ts$n30[t]-ts$n180[t])                 # cumulative excess over the 180-day average.
        ts$o30d365[t] <- ts$o30d365[t-1]+(ts$n30[t]-ts$n365[t])                 # cumulative excess over the 365-day average.
        ts$o30d1461[t] <- ts$o30d1461[t-1]+(ts$n30[t]-ts$n1461[t])              # cumulative excess over the 1461-day average.
        ts$o30d3652[t] <- ts$o30d3652[t-1]+(ts$n30[t]-ts$n3652[t])              # cumulative excess over the 3652-day average.
        }
      if(t%in%(seq(1000,100000,1000))) print(t)                             # progress report every 1000 days
      flush.console()
      }
  tss[[i]] <- ts                                                            # store this topic's time series in the shared list.
  print(i)                                                                  # report how many topics are completed.
  flush.console()
  }


  # NOTE(review): the return values of both lapply() calls are discarded —
  # wavecounter7/wavecounter are presumably run for side effects, or an
  # assignment is missing; confirm intent.
  lapply(tss,FUN="wavecounter7")

  lapply(tss,FUN="wavecounter")


  # Run waveanalyzer on every topic time series and stack the results, tagging
  # each block with its topic number as a factor.
  # FIX: the original grew wavedata with rbind() inside the loop (quadratic
  # copying); results are now collected in a list and combined once with
  # do.call(rbind, ...) — same rows, same order.
  wavedata_parts <- vector("list", 120)
  for (i in 1:120)
  {
    wavedata_parts[[i]] <- data.frame(topic=factor(i),waveanalyzer(tss[[i]]))
    print(i)          # progress indicator
    flush.console()
  }
  wavedata <- do.call(rbind, wavedata_parts)

  # Human-readable labels for the 120 topics.
  # NOTE(review): topic 98 contains the typo 'Gorwth' (Growth) and topic 50 a
  # doubled slash ('Million//Billion'); left unchanged here because downstream
  # code may match these exact strings — confirm before fixing.
  wavedata$TOPIC <- Recode(wavedata$topic,"'1'='International crisis'; '2'='Sex, gender';'3'='Age, youth, old age';'4'='Money, card, payment, identity';'5'='Warning';'6'='Law, petition';'7'='Writing, book';'8'='Communication Technology';'9'='<OCR>';'10'='Title';
  '11'='South Africa';'12'='Boss';'13'='Strike/Unions';'14'='Family/Education';'15'='Percentages';'16'='Epidemic';'17'='<OCR>';'18'='Military/Generals';'19'='Europe/EU';'20'='Market/Stock/Prices/Investment';
  '21'='Sports/Cup/Play/Win';'22'='Party/Election/Coalition';'23'='Profits/Dividends/Turnover/Sales/Tax';'24'='Disaster/Victims/Rescue/Survivor/Missing';'25'='Season/Month/Influenza';'26'='Army/Service/Volunteers';'27'='Film/Cinema/Comedy/Drama';'28'='China/Asia/Hong Kong';'29'='Oil/Petrol/Fuel/Barrel';'30'='Medicine/Drugs/Virus/Disease/Tests';
  '31'='Egypt/Cairo/Pasha/Telegram';'32'='Week/Weekday';'33'='Railways/Companies/Shareholders';'34'='Cost/Wage/Productivity/Increase/Inflation/Prices';'35'='Colors';'36'='Research/Survey/Data/Analysis';'37'='Nobility';'38'='<OCR>';'39'='Trade/Balance/Exports/Import/Wool';'40'='Bill/Debate/Government/Opposition/Legislation';
  '41'='Ship/Crew/Passengers';'42'='Earthquake/Town/Destroyed/Killed';'43'='Government/Crisis/Drastic/Situation/Measures/Urgency';'44'='School/Education/Teachers/Pupils';'45'='Wrong/Reality/Believes/Culture/Focus/Media';'46'='Space/Launch/Survive';'47'='Journalism/Press/Newspaper';'48'='Commission/Committee/Report/Investigation';'49'='Coal/Miners/Mine/Pit/Colliery/Explosion';'50'='Million//Billion/Investor/Fund/Equity/Merger/Hedge';
  '51'='Army/Military/War/Troops';'52'='Train/Railway/Accident/Passenger';'53'='Famine/Aid/Relief/Refugees';'54'='Industry/Production/Steel/Manufacturing/Orders';'55'='Church/Catholic/Bishop/Pope'; '56'='President/political/government/country/power';'57'='Health/Hospital/Care/Patients/Doctors/Nurses';'58'='Cabinet/Minister/Resignation/Dissolution/Deputy/Trust';'59'='Football/Club/Fans'; '60'='Power/Energy/Gas/Nuclear/Electricity';
  '61'='Fever/Districts/Precautions/Cases';'62'='Work/Job/Staff/Employee/Employment/Unemployment/Redundancies';'63'='Flood/Water/Dam/River/Rain';'64'='Germany/German/Chancellor';'65'='Sea/Island/Rig/Fish/Ocean';'66'='Britain/Commonwealth/Trade/Agreement/Beef';'67'='Filler words';'68'='Israel/Prime Minister/Meetings/Talks';'69'='Children/Family/Parents/Stress';'70'='Sugar/Expenditure/Tax/Revenue';
  '71'='ISIS/Iraq/Iran/Syria/Terrorism';'72'='University/Training/Skills/Technology';'73'='Officers/Commanders/Memorial';'74'='Transport/Road/Rail/Traffic/Bus';'75'='Business/Company/Insurers/Firm';'76'='Weakening Terms';'77'='News/TV/Radio/Music';'78'='Austria/Bulgaria/Russia/Emperor';'79'='Currency/Inflation/Exchange Rate/Yen/Dollar/Stirling';'80'='Cotton/Meeting/Conference/Committee/Federation';
  '81'='Title2';'82'='Gallery/Art/Exhibition/Theatre/Festival';'83'='Air/Aircraft/Flight/Aviation';'84'='Ambulance/Spokesperson';'85'='Greece/Turkey';'86'='Bridge/Roof/Collapse';'87'='Banks/Financial/Credit/Debt/Mortgage';'88'='Australia/New Zealand/Canada/New York';'89'='Economy/Economic Policy/Inflation/Unemployment/Successive';'90'='Challenge';
  '91'='Tax/Budget/Spending/Pension/Government/Cuts/Deficit';'92'='Damages/Court/Justice/Legal/Law/Lawyer';'93'='Word fragments';'94'='Christmas/Holiday/Tourists/Sales/Wine';'95'='USA/Washington/America/President';'96'='Coffee/Property/Market/Buyers/Price';'97'='Evidence/Witness/Coroner/Jury/Inquiry/Evidence';'98'='Recession/Gorwth/Economy/Consumer/Sector';'99'='Ireland/Irish/Unionist';'100'='Tin/Contract/Creditor/Copper/Exchange/Metal';
  '101'='University/Professor/Students/College/Oxbridge';'102'='Nation/Power/Country/People/Great/Empire';'103'='Coronavirus/Bird/Pandemic';'104'='Animal/Cattle/Mouth/Famers/Food/Ban';'105'='Building/Housing/Homes/Homeless/Charities';'106'='Bank/Gold/Loan/Currency/Silver/Circulation';'107'='Korea/Japan/Brazil';'108'='Russia/Soviet Union/Polish/Communist';'109'='King/Queen/Royal/Majesty';'110'='Police/Prison/Crime/Murder';
  '111'='Fire/Explosion/Flame/Burning/Smoke';'112'='London/Local/Council/Borough';'113'='India/Delhi/Colony/Province/Governor';'114'='Car Industry/Motor/Sales/Company';'115'='Food/Farmers/Agriculture/Wheat/Grain/Crop';'116'='France/Portugal/Spain/Belgium';'117'='Labour/Conservative/Tories/Cameron/Thatcher/Johnson/Blair';'118'='War/Peace/Treaty/NATO/Allies';'119'='Lordship/Gentleman/Majesty';'120'='Crowd/Mob/Hunt/Castle/Soldiers'")

  # Map each of the 120 STM topic numbers onto a coarse thematic AREA label
  # (Recode() is from the car package; the mapping string is data, not logic).
  # FIX: topic 103 was labelled 'Epidemics' while topics 16/25/61 use
  # 'Epidemic'; unified to 'Epidemic' so the category is not split in two.
  wavedata$AREA <- Recode(wavedata$topic,"'1'='Geopolitical';'2'='Society';'3'='Society';'4'='Economy';'5'='Functional';'6'='Government';'7'='Leisure';'8'='Technology';'9'='Misc';'10'='Functional';
  '11'='Geopolitical';'12'='Functional';'13'='Labor';'14'='Family';'15'='Economy';'16'='Epidemic';'17'='Misc';'18'='Military';'19'='Geopolitical';'20'='Economy';
  '21'='Leisure';'22'='Government';'23'='Economy';'24'='Disaster';'25'='Epidemic';'26'='Military';'27'='Leisure';'28'='Geopolitical';'29'='Energy';'30'='Health';
  '31'='Geopolitical';'32'='Functional';'33'='Transport';'34'='Economy';'35'='Functional';'36'='Science';'37'='Functional';'38'='Misc';'39'='Economy';'40'='Government';
  '41'='Transport';'42'='Disaster';'43'='Government';'44'='Education';'45'='Functional';'46'='Technology';'47'='Public';'48'='Justice';'49'='Disaster';'50'='Economy';
  '51'='Military';'52'='Disaster';'53'='Disaster';'54'='Economy';'55'='Society';'56'='Government';'57'='Health';'58'='Government';'59'='Leisure';'60'='Energy';
  '61'='Epidemic';'62'='Labor';'63'='Disaster';'64'='Geopolitical';'65'='Geopolitical';'66'='Economy';'67'='Functional';'68'='Geopolitical';'69'='Family';'70'='Economy';
  '71'='Geopolitical';'72'='Education';'73'='Military';'74'='Transport';'75'='Economy';'76'='Functional';'77'='Public';'78'='Geopolitical';'79'='Economy';'80'='Economy';
  '81'='Functional';'82'='Leisure';'83'='Transport';'84'='Disaster';'85'='Geopolitical';'86'='Disaster';'87'='Economy';'88'='Geopolitical';'89'='Economy';'90'='Functional';
  '91'='Economy';'92'='Justice';'93'='Functional';'94'='Leisure';'95'='Geopolitical';'96'='Economy';'97'='Justice';'98'='Economy';'99'='Geopolitical';'100'='Economy';
  '101'='Education';'102'='Government';'103'='Epidemic';'104'='Economy';'105'='Welfare';'106'='Economy';'107'='Geopolitical';'108'='Geopolitical';'109'='Government';'110'='Justice';
  '111'='Disaster';'112'='Geopolitical';'113'='Geopolitical';'114'='Economy';'115'='Economy';'116'='Geopolitical';'117'='Government';'118'='Military';'119'='Functional';'120'='Public'")

  # Finer-grained thematic classification (Economy/* and Location/* subtypes).
  # FIX: topic 103 was labelled 'Epidemics' while topics 16/25/61 use
  # 'Epidemic'; unified to 'Epidemic' for consistency.
  wavedata$AREA2 <- Recode(wavedata$topic,"'1'='Geopolitical';'2'='Society';'3'='Society';'4'='Economy/Market';'5'='Functional';'6'='Government/Legislative';'7'='Leisure/Literature';'8'='Technology/Communication';'9'='Misc';'10'='Functional';
  '11'='Location/Africa';'12'='Functional';'13'='Labor';'14'='Family';'15'='Economy/Numbers';'16'='Epidemic';'17'='Misc';'18'='Military';'19'='Location/Europe';'20'='Economy/Market';
  '21'='Leisure/Sports';'22'='Government/Executive';'23'='Economy/Business';'24'='Disaster';'25'='Epidemic';'26'='Military';'27'='Leisure/Drama';'28'='Location/Asia';'29'='Energy';'30'='Health';
  '31'='Location/MiddleEast';'32'='Functional/Time';'33'='Transport';'34'='Economy/Secondary';'35'='Functional';'36'='Science';'37'='Functional/Address';'38'='Misc';'39'='Economy/Primary';'40'='Government/Legislative';
  '41'='Transport';'42'='Disaster';'43'='Government/Executive';'44'='Education';'45'='Functional';'46'='Technology';'47'='Public';'48'='Justice/Investigation';'49'='Disaster';'50'='Economy/Investor';
  '51'='Military';'52'='Disaster';'53'='Disaster';'54'='Economy/Secondary';'55'='Society';'56'='Government/Executive';'57'='Health';'58'='Government/Executive';'59'='Leisure';'60'='Energy';
  '61'='Epidemic';'62'='Labor';'63'='Disaster';'64'='Location/Europe';'65'='Location/Oceans';'66'='Economy/Primary';'67'='Functional';'68'='Location/MiddleEast';'69'='Family';'70'='Economy/Primary';
  '71'='Location/NearEast';'72'='Education';'73'='Military';'74'='Transport';'75'='Economy/Financial';'76'='Functional';'77'='Public';'78'='Location/Europe';'79'='Economy/Currency';'80'='Economy/Trade';
  '81'='Functional';'82'='Leisure';'83'='Transport';'84'='Disaster';'85'='Location/Europe';'86'='Disaster';'87'='Economy/Financial';'88'='Location/Oceania';'89'='Economy/Policy';'90'='Functional';
  '91'='Economy/Policy';'92'='Justice/Case';'93'='Functional';'94'='Leisure';'95'='Location/America';'96'='Economy/Market';'97'='Justice/Case';'98'='Economy/Macro';'99'='Location/Europe';'100'='Economy/Primary';
  '101'='Education';'102'='Government/Executive';'103'='Epidemic';'104'='Economy/Primary';'105'='Welfare';'106'='Economy/Currency';'107'='Location/Asia';'108'='Location/Europe';'109'='Government/Royals';'110'='Justice/Case';
  '111'='Disaster';'112'='Location/UK';'113'='Location/Asia';'114'='Economy/Secondary';'115'='Economy/Primary';'116'='Location/Europe';'117'='Government/Executive';'118'='Military';'119'='Functional';'120'='Society'")



  # Yearly article volume in The Times, used to derive a minimum-volume
  # threshold for crisis waves (0.04% of yearly output, floored at 5).
  newsvolume <- data.frame(year=thetimes$year,articles=thetimes$articles)
  newsvolume$threshold <- newsvolume$articles*0.0004
  newsvolume$threshold2 <- ifelse(newsvolume$threshold>5,newsvolume$threshold,5)
  # Convert the wave start day-counter into a calendar year: day counts are
  # turned into POSIXct with origin 1784-12-31 (so day 1 = 1785-01-01).
  # NOTE(review): the format= argument to as.Date() is ignored for POSIXct
  # input; the year is actually obtained by substring()ing the implicit
  # "YYYY-MM-DD" character representation. Confirm time-zone behavior for
  # dates near year boundaries.
  wd20$year <- substring(as.Date(as.POSIXct(wd20$start*60*60*24,origin="1784-12-31"),format="%Y"),1,4)

  # Attach the year-specific threshold to each wave, floored at 5 articles.
  wd20$thresh <- newsvolume[match(wd20$year,newsvolume$year),"threshold"]
  wd20$thresh2 <- ifelse(wd20$thresh>5,wd20$thresh,5)

  # Keep waves whose volume exceeds one fifth of the floored threshold.
  wd20x <- subset(wd20,volume>thresh2/5)

  # Diagnostic scatter plots of wave volume, duration and intensity over
  # time, each with a smoothed trend (theme_bluewhite() is a project theme).
  ggplot(wd20x, aes(x = as.numeric(year), y = volume)) +
    geom_point() +
    theme_bluewhite() +
    scale_x_continuous(breaks = seq(1780, 2020, 20)) +
    geom_smooth()
  ggplot(wd20x, aes(x = as.numeric(year), y = duration)) +
    geom_point() +
    theme_bluewhite() +
    scale_x_continuous(breaks = seq(1780, 2020, 20)) +
    geom_smooth()
  ggplot(wd20x, aes(x = as.numeric(year), y = intensity)) +
    geom_point() +
    theme_bluewhite() +
    scale_x_continuous(breaks = seq(1780, 2020, 20)) +
    geom_smooth()

  # Number of retained waves per year, labelled with the year itself.
  ggplot(data.frame(table(wd20x$year)), aes(x = as.numeric(Var1), y = Freq, label = Var1)) +
    geom_text() +
    theme_bluewhite() +
    geom_smooth()

  # Apply the volume threshold to the full wave data.
  # NOTE(review): the subset condition reads thresh2 from within wavedata;
  # confirm that wavedata already carries thresh/thresh2 columns here.
  wavedata.vol50 <- subset(wavedata,volume>thresh2)

  # Recompute the floored-at-5 threshold on the subset.
  wavedata.vol50$thresh2 <- ifelse(wavedata.vol50$thresh>5,wavedata.vol50$thresh,5)

  # Re-filter with the recomputed threshold.
  wd5 <- subset(wavedata.vol50,volume>thresh2)

  # Working parameters for interactive compareVocabulary() runs:
  #   nover/nunder - number of over-/under-represented terms to extract
  #   pre/post     - comparison window lengths (days) before/after a wave
  nover <- 10
  nunder <- 10
  pre <- 1000
  post <- 1000
  # Parameters of the first wave, for a quick test run.
  start <- wavedata.vol50[1,"start"]
  end <- wavedata.vol50[1,"end"]
  topic <- wavedata.vol50[1,"TOPIC"]
  area <- wavedata.vol50[1,"AREA"]

  # Immediately overwritten: inspect the wave in row 452 instead.
  start <- wavedata.vol50[452,"start"]
  end <- wavedata.vol50[452,"end"]
  topic <- wavedata.vol50[452,"TOPIC"]
  area <- wavedata.vol50[452,"AREA"]


  # Lookup table: topic number -> STM column name ("X1".."X120").
  topic.eq <- data.frame(topic=1:120,topicX=paste0("X",1:120))

  # Align the STM document-topic matrix with the dfm's document order.
  # NOTE(review): the aligned copy is stored as wide_STM (underscore) but
  # all later code reads wide.STM (dot) -- confirm which object is intended.
  wide_STM <- wide.STM[match(dtm@docvars$id,wide.STM$id),]

  # Attach the day counter to the dfm's docvars, matched by document id.
  dtm@docvars$days <- wide.STM$days[match(dtm@docvars$id,wide.STM$id)]

  # Propagate the day counter to the trimmed dfm variants.
  tdtm2@docvars$days <- dtm@docvars$days
  tdtm3@docvars$days <- dtm@docvars$days

  dtm <- tdtm2

  # Restrict tdtm3 to the documents present in the cpx corpus.
  # NOTE(review): because dtm <- tdtm2 just above, adtm1 and adtm2 are built
  # from the same id filter and should be identical -- confirm intended.
  adtm1 <- tdtm3[dtm@docvars$id%in%docvars(cpx)$id,]
  adtm2 <- tdtm3[tdtm2@docvars$id%in%docvars(cpx)$id,]
  adtm3 <- tdtm3[tdtm3@docvars$id%in%docvars(cpx)$id,]

  # NOTE(review): wide.STM$days is assigned positionally here, while the
  # adtm* objects were just row-filtered -- confirm lengths still align.
  adtm1@docvars$days <- wide.STM$days
  adtm2@docvars$days <- wide.STM$days
  adtm3@docvars$days <- wide.STM$days

  # Compare a crisis wave's vocabulary with the periods directly before and
  # after it, and collect the documents most strongly loading on its topic.
  #
  # Arguments:
  #   data   - quanteda dfm with a docvar `days` (day counter)
  #   pre    - length of the pre-wave window, in days
  #   post   - length of the post-wave window, in days
  #   start  - wave start (days since origin)
  #   end    - wave end (days since origin)
  #   nover  - number of top over-represented terms kept per ranking
  #   nunder - accepted but not used anywhere in the body
  #   topic  - topic number (1-120); the target window is restricted to
  #            documents whose STM probability for this topic is > 0.005,
  #            via the globals wide.STM and topic.eq
  #   area   - thematic area label, echoed back unchanged in the result
  #
  # Returns a list: start/end dates, topic/area, a merged keyword list, the
  # target-period texts ordered by topic probability, and the top-nover
  # terms by relative and absolute change (pre- and post-comparisons).
  #
  # NOTE(review): the defaults (data=data, pre=pre, ...) are self-referential
  # and error if any argument is omitted -- all arguments must be explicit.
  # NOTE(review): dates here use origin "0000-01-01" while the surrounding
  # script uses "1784-12-31" -- confirm which origin the day counter assumes.
  compareVocabulary <- function(data=data,pre=pre,post=post,start=start,end=end,nover=nover,nunder=nunder,topic=topic,area=area)
    {
    startdate <- as.POSIXct(start*60*60*24,origin="0000-01-01")
    enddate   <- as.POSIXct(end*60*60*24,origin="0000-01-01")
    # pre.bag.dtm   <- dfm_subset(data,days>(start-pre) & days<start) # without topic requirement
    # target.bag.dtm    <- dfm_subset(data,days>(start) & days<end)   # without topic requirement
    # post.bag.dtm  <- dfm_subset(data,days>(end) & days<(end+post))  # without topic requirement
    # Document sets: window before the wave, the wave itself (restricted to
    # documents loading on the wave's topic), and the window after the wave.
    pre.bag.dtm     <- dfm_subset(data,days>(start-pre) & days<start)
    target.bag.dtm  <- dfm_subset(data,days>(start) & days<end  & wide.STM[,topic.eq[as.numeric(topic),"topicX"]]>0.005)
    post.bag.dtm    <- dfm_subset(data,days>(end) & days<(end+post))
    # Keep only features seen in at least one window; zero counts are floored
    # at 1 so the probability ratios below remain defined (add-one style).
    full.bag    <- colSums(pre.bag.dtm)+colSums(target.bag.dtm)+colSums(post.bag.dtm)
    pre.bag     <- ifelse(colSums(pre.bag.dtm)[full.bag>0]>0,colSums(pre.bag.dtm)[full.bag>0],1)
    target.bag  <- ifelse(colSums(target.bag.dtm)[full.bag>0]>0,colSums(target.bag.dtm)[full.bag>0],1)
    post.bag    <- ifelse(colSums(post.bag.dtm)[full.bag>0]>0,colSums(post.bag.dtm)[full.bag>0],1)
    # Per-window relative term frequencies.
    pre.pr      <- pre.bag/sum(pre.bag,na.rm=TRUE)
    target.pr   <- target.bag/sum(target.bag,na.rm=TRUE)
    post.pr     <- post.bag/sum(post.bag,na.rm=TRUE)

    ### Pre-to-target
    # Probability ratio, probability difference and raw frequency difference.
    pre.prfactor    <- (target.pr/pre.pr)
    pre.prraw       <- (target.pr-pre.pr)
    pre.fraw        <- (target.bag-pre.bag)
    pre.voc.change <- data.frame(feature=names(pre.prfactor),pre.bag,target.bag,pre.pr,target.pr,probability.factor=pre.prfactor,probability.jump=pre.prraw,frequency.jump=pre.fraw,frequency.jump.relative=pre.fraw/target.bag)
    # Keep sufficiently frequent features with a marked increase.
    # NOTE(review): this filter differs from the post-to-target filter below
    # (additive score vs two hard cutoffs) -- confirm the asymmetry is
    # intended.
    pre.top.voc.change <- subset(pre.voc.change,target.bag>10&target.pr>0.0001 & (probability.factor + probability.jump*5000>2))
    pre.top20.rchange <- pre.top.voc.change[order(pre.top.voc.change$probability.factor*1000*pre.top.voc.change$target.pr,decreasing=TRUE),][1:nover,]
    pre.top20.fchange <- pre.top.voc.change[order(pre.top.voc.change$frequency.jump*1000*pre.top.voc.change$target.pr,decreasing=TRUE),][1:nover,]

    ### Post-to-target
    post.prfactor   <- (target.pr/post.pr)
    post.prraw      <- (target.pr-post.pr)
    post.fraw           <- (target.bag-post.bag)
    post.voc.change <- data.frame(feature=names(post.prfactor),post.bag,target.bag,post.pr,target.pr,
                probability.factor=post.prfactor,probability.jump=post.prraw,
                frequency.jump=post.fraw,frequency.jump.relative=post.fraw/target.bag)
    post.top.voc.change <- subset(post.voc.change,target.bag>10&target.pr>0.0001 & (probability.factor>2 & probability.jump>0.0001))
    post.top20.rchange <- post.top.voc.change[order(post.top.voc.change$probability.factor,decreasing=TRUE),][1:nover,]
    post.top20.fchange <- post.top.voc.change[order(post.top.voc.change$frequency.jump,decreasing=TRUE),][1:nover,]

    # Union of all four top-nover term lists.
    wordlist <- paste(  unique(c(as.character(pre.top20.rchange$feature),
              as.character(post.top20.rchange$feature),
              as.character(pre.top20.fchange$feature),
              as.character(post.top20.fchange$feature))))

    # Target-window documents with topic probability, headline and date, for
    # manual inspection of what the wave was about.
    topic.textlist <- data.frame(id=wide.STM[wide.STM$id%in%target.bag.dtm@docvars$id,"id"],prob=wide.STM[wide.STM$id%in%target.bag.dtm@docvars$id,topic.eq[as.numeric(topic),"topicX"]],headline=wide.STM[wide.STM$id%in%target.bag.dtm@docvars$id,"headline"],date=wide.STM[wide.STM$id%in%target.bag.dtm@docvars$id,"Time"])
    # order(topic.textlist$prob)

    return(list(startdate=startdate,enddate=enddate,
          topic=topic,area=area,
          wordlist=wordlist,
          texts=topic.textlist[order(topic.textlist$prob,decreasing=TRUE),],
          top20.relative.change.pre.to.target=pre.top20.rchange,
          top20.relative.change.post.to.target=post.top20.rchange,
          top20.absolute.change.pre.to.target=pre.top20.fchange,
          top20.absolute.change.post.to.target=post.top20.fchange))
    }

  # Row indices of waves after 2014 (handy for spot checks).
  yearlist <- which(wavedata.vol50$year>2014)

  # Example call with a short 5-day post window for the 4th wave.
  # NOTE(review): wavedata2 is only created further down in this script --
  # confirm the intended execution order before running top to bottom.
  x <- compareVocabulary(data=tdtm3,pre=1000,post=5,start=wavedata2$start[4],end=wavedata2$end[4],nover=10,nunder=10,topic=wavedata2$TOPIC[4],area=wavedata2$AREA[4])

  # Explicit parameter set for an interactive run on the 4th wave of wd5.
  pre <- 1000
  post <- 1000
  start <- wd5$start[4]
  end <- wd5$end[4]
  nover <- 10
  nunder <- 10
  topic <- wd5$TOPIC[4]
  area <- wd5$AREA[4]

  compareVocabulary(data=adtm3,pre=pre,post=post,start=start,end=end,nover=nover,nunder=nunder,topic=topic,area=area)

  # Waves with an identified topic only.
  wavedata2 <- subset(wd5,!is.na(TOPIC))

  # Keyword comparison for every wave on the adtm3 dfm, with progress output.
  W <- dim(wavedata2)[1]
  wave.keywords <- list()
  for (w in 1:W)
  {
    wave.keywords[[w]] <- compareVocabulary(data=adtm3,pre=1000,post=1000,start=wavedata2$start[w],end=wavedata2$end[w],nover=10,nunder=10,topic=wavedata2$TOPIC[w],area=wavedata2$AREA[w])
    flush.console()
    print(paste0(w,"/",W))
  }

  # Same comparison on the adtm1 dfm.
  # NOTE(review): adtm1 is built from a different id filter than adtm3
  # earlier in the file -- confirm the two variants actually differ.
  WW <- length(wave.keywords)
  wave.keywords2 <- list()
  for (ww in 1:WW)
  {
    wave.keywords2[[ww]] <- compareVocabulary(data=adtm1,pre=1000,post=1000,start=wavedata2$start[ww],end=wavedata2$end[ww],nover=10,nunder=10,topic=wavedata2$TOPIC[ww],area=wavedata2$AREA[ww])
    flush.console()
    print(paste0(ww,"/",WW))
  }

  # Extract the top headlines from a compareVocabulary() result.
  #
  # Arguments:
  #   x - list as returned by compareVocabulary(); x$texts is a data frame
  #       ordered by descending topic probability with a `headline` column.
  #   n - number of headlines to return (default 10, the original behavior).
  #
  # Returns a character vector of length n. As with the original 1:10
  # indexing, the result is padded with NA when fewer than n texts exist.
  headline_extract <- function(x, n = 10)
  {
    x$texts$headline[seq_len(n)]
  }

  # 10 headlines per wave, reshaped so each wave is one row (waves x 10).
  wkm <- t(matrix(unlist(lapply(wave.keywords2,headline_extract)),nrow=10))

#  "1973 Wedding of Princess Anne",
#  "1982 Falkland War: Papal visit to Argentine/Call for Peace",
#  "2008 Lehman Crisis: Trust Crisis (religious terms)",
#  "2020 Covid-19: Air Travel Restrictions",
#  "2020 Covid-19: Economic Repercussions",
#  "1982 Falkland War: Escalation/Warnings/Ceasefire",
#  "2008 Lehman Crisis: Bailouts across Europe/early phase",
#  "2020 Covid-19: Warnings/Fears",
#  "1807 Law/petition ???",
#  [10]
#  "1819 Law/petition ???",
#  "1819/20 Law/petition ???",
#  "1821 Law/petition ???",
#  "1821 Law/petition ???",
#  "1824 Law/petition ???",
#  "2008 Lehman Crisis: Various repercussions",
#  "2020 Covid-19: Various repercussions",
#  "2020 Covid-19: Tracking Apps",
#  "1982 Falkland War: Identity/Allegiance of Falklanders",
#  "2008 Lehman Crisis: Bank Misconduct/Bank Failure/Bank Bailout",
#  [20]
#  "2020 Covid-19: ???",
#  "1899 HMS Bullfinch Disaster 1899 (Ship fire)",          # https://en.wikipedia.org/wiki/HMS_Bullfinch_(1898)
#  "2020 Covid-19: ???",
#  "1891 Government Crises in Italy/Serbia/Norway 1891",    
#  "1919 U.S. Steel Strikes and Industrial Crisis", # https://en.wikipedia.org/wiki/Steel_strike_of_1919
#  "1920 Strikes around Europe",
#  "1973/74 Oil Crisis: Workers reactions",
#  "1979/80 Various Strikes",
#  "1973/74 Oil Crisis: Early phase/worries/downturn/recession",
#  "1991: Recession", #https://en.wikipedia.org/wiki/Early_1990s_recession
#  [30]
#  "2008: Lehman Crisis: Early Phase/worries/downturn/recession",
#  "2020: Covid-19: Early phase/economic worries/downturn/recession", 
#  [32]
#  "1849: Cholera Epidemic (London, Liverpool, Hull)", #https://en.wikipedia.org/wiki/1846%E2%80%931860_cholera_pandemic
#  "1853: Cholera Epidemic (London)", # https://en.wikipedia.org/wiki/1846%E2%80%931860_cholera_pandemic
#  "1854: Cholera Epidemic (London)", # https://en.wikipedia.org/wiki/1846%E2%80%931860_cholera_pandemic
#  "1861: Doab famine of 1860–1861 (India) / London And Brighton Railway Catastrophe / Clayton Tunnel rail crash", #https://en.wikipedia.org/wiki/Upper_Doab_famine_of_1860%E2%80%931861
#  "1864: **Belvedere Catastrophe / Gunpowder explosion at Erith", # http://www.dover-kent.com/Other-info/1864-Erith-Explosion.html
#  "1883: *Cholera in Egypt / Disaster on the Clyde / SS Daphne / Victoria Hall Disaster", # https://en.wikipedia.org/wiki/SS_Daphne_(1883) # https://en.wikipedia.org/wiki/Victoria_Hall_disaster
#  "1884: Enteric fever / Kidderminster" # https://ur.booksc.eu/book/24502636/9b5a3b
#  [40]
#  "1885: **Gladstone-to-Salisbury Transition",#https://en.wikipedia.org/wiki/Robert_Gascoyne-Cecil,_3rd_Marquess_of_Salisbury#Domestic_policy
#  "1886: **Shipping (Victoria) and colliery (Leicestershire) disasters",
#  "1887: Scarlet Fever in London", #https://trove.nla.gov.au/newspaper/article/7898280
#  "1890: Influenza Epidemic", #https://en.wikipedia.org/wiki/1889%E2%80%931890_pandemic
#  "1892: Influenza Epidemic", #https://en.wikipedia.org/wiki/1889%E2%80%931890_pandemic
#  "1892: *Various accidents, influenza at schools",
#  "1893: *Cholera / Coal Trade Crisis",
#  "1894: Smallpox Epidemic",
#  "1897: Typhoid Epidemic at Maidstone",
#  "1908: *Cholera Epidemic / Near-East Crisis / Colliery Disaster",
#  [50]
#  "1911: Measles Epidemic",
#  "1918: Influenza Epidemic (Spanish Flu)",
#  "1918: Influenza Epidemic (Spanish Flu)",
#  "1923/24: Foot-and-Mouth Disease",
#  "1927: *Influenza Epidemic / Chinese Crisis / Colliery Disaster",
#  "1947: *Cholera in Egypt / Anti-Communist Campaigns",
#  "1951: *Influenza (Liverpool), NHS, Dutch Cabinet Crisis",
#  "1953: North Sea Floods", # https://en.wikipedia.org/wiki/North_Sea_flood_of_1953#United_Kingdom
#  "1957: Asiatic Influenza", # https://en.wikipedia.org/wiki/1957%E2%80%931958_influenza_pandemic
#  "1965: *Polio",
#  [60]
#  "1967: *Cattle Disease South Africa",
#  "2020: Covid-19",
#
#  [124] 
#  "1889/90 Influenza",
#  "1918/19 Spanish Flu",
#  "1938: **Munich Conference/Nazi threat",
#  "1947: **After-War shortages", 
#  "1973/74: **Oil Crisis: Shortages",
#  "1982: **Falkland War: Shortages ",
#  [130]
#  "1991: *Recession/Cholera",
#  "2008: **Lehman Crisis: Shortages",
#  "2011: **European Debt Crisis: Shortages",
#  "2020: Covid-19",
#
#  [441]
#  "1819: Fever in Spain",
#  "1846: ??? South Africa",
#  "1849: Cholera (London)",
#  "1853: Cholera",
#  "1865: Typhus/Russian Epidemic/Russian Pestilence",
#  "1865: Cholera (Mediterranean)",
#  "1866: Cholera",
#  "1868: **Catastrophe at Hull",
#  "1871: **Explosion on a steamer",
#  [450]
#  "1873: **Fires and political crises",
#  "1873: **American Financial Crisis/Disaster on the river Prah",
#  "1876: *Sick and wounded",
#  "1878: *Yellow Fever / Colliery Explosion",
#  "1879/80: *Colliery explosions and bridge collapses",
#  "1883: *Cholera/Disaster on Clyde",
#  "1884: Cholera at Naples",
#  "1885: Cholera in Spain",
#  "1887: Scarlet Fever Epidemic",
#  "1889: *Floods/Explosions/Yellow Fever",
#  [460]
#  "1890: Influenza Epidemic",
#  "1890: *Shipwrecks/Irish Crisis",
#  "1892: *Influenza/Famine in Russia/Mining Disaster",
#  "1892: *Cholera/Chamonix Disaster/Hurricane/Colliery Disaster",
#  "1896: *Transvaal Crisis/Colliery Disaster/Anglo-American Crisis",
#  "1897: Typhoid Epidemic Maidstone",
#  "1901/02: *Several Mining/Ship/Natural Disasters (Earthquake)",
#  "1907: *Wine Crisis in France, Earthquake in Jamaica",
#  "1907: *Earthquake in Italy/Financial Crisis in New York",
#  "1910/11: *Colliery explosion, railway disaster, Fire",
#  [470]
#  "1918: Spanish Flu",
#  "1938: *Czech crisis/Fall of Canton",
#  [841]
#  "2020: Covid-19"

#  125 126 127 128 129 130 131 132 133 441 442 443 444 445 446
#  447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471]

  # Re-run the keyword comparison for every wave above the volume threshold.
  W <- dim(wavedata.vol50)[1]

  # Preallocate the result list instead of growing it inside the loop.
  wave.keywords <- vector("list", W)

  for (w in seq_len(W))
  {
    # FIX: the original call omitted data=, and compareVocabulary()'s
    # self-referential default (data = data) errors with "promise already
    # under evaluation" when the argument is unset. adtm3 matches the dfm
    # used by the analogous wavedata2 loop above -- confirm the choice.
    wave.keywords[[w]] <- compareVocabulary(data=adtm3,pre=1000,post=1000,start=wavedata.vol50$start[w],end=wavedata.vol50$end[w],nover=10,nunder=10,topic=wavedata.vol50$TOPIC[w],area=wavedata.vol50$AREA[w])
    flush.console()
    print(paste0(w,"/",W))
  }
  # wave.keywords <- x

  # Copy per-wave results back onto the data frame.
  # NOTE(review): assigning POSIXct values element-wise into a fresh column
  # stores them as plain numbers (the class attribute is lost); convert with
  # as.POSIXct(..., origin=...) afterwards if calendar dates are needed.
  for (w in 1:W)
  {
    wavedata.vol50$startdate[w] <- wave.keywords[[w]]$startdate
    wavedata.vol50$enddate[w] <- wave.keywords[[w]]$enddate
    wavedata.vol50$wordlist[w] <- paste(wave.keywords[[w]]$wordlist,sep="",collapse="///") 
  }

  # Decade of each wave (e.g. 1987 -> 1980).
  wavedata.vol50$decade <- floor(as.numeric(wavedata.vol50$year)/10)*10


  # Human-readable label for each of the 120 STM topics (car::Recode; the
  # mapping string is data, not logic). NOTE(review): the labels contain the
  # source's own typos ('Recession/Gorwth' at 98, 'Stirling' at 79); they are
  # left untouched here because downstream tables/plots key on these strings.
  wavedata.vol50$TOPIC <- Recode(wavedata.vol50$topic,"'1'='International crisis'; '2'='Sex, gender';'3'='Age, youth, old age';'4'='Money, card, payment, identity';'5'='Warning';'6'='Law, petition';'7'='Writing, book';'8'='Communication Technology';'9'='<OCR>';'10'='Title';
  '11'='South Africa';'12'='Boss';'13'='Strike/Unions';'14'='Family/Education';'15'='Percentages';'16'='Epidemic';'17'='<OCR>';'18'='Military/Generals';'19'='Europe/EU';'20'='Market/Stock/Prices/Investment';
  '21'='Sports/Cup/Play/Win';'22'='Party/Election/Coalition';'23'='Profits/Dividends/Turnover/Sales/Tax';'24'='Disaster/Victims/Rescue/Survivor/Missing';'25'='Season/Month/Influenza';'26'='Army/Service/Volunteers';'27'='Film/Cinema/Comedy/Drama';'28'='China/Asia/Hong Kong';'29'='Oil/Petrol/Fuel/Barrel';'30'='Medicine/Drugs/Virus/Disease/Tests';
  '31'='Egypt/Cairo/Pasha/Telegram';'32'='Week/Weekday';'33'='Railways/Companies/Shareholders';'34'='Cost/Wage/Productivity/Increase/Inflation/Prices';'35'='Colors';'36'='Research/Survey/Data/Analysis';'37'='Nobility';'38'='<OCR>';'39'='Trade/Balance/Exports/Import/Wool';'40'='Bill/Debate/Government/Opposition/Legislation';
  '41'='Ship/Crew/Passengers';'42'='Earthquake/Town/Destroyed/Killed';'43'='Government/Crisis/Drastic/Situation/Measures/Urgency';'44'='School/Education/Teachers/Pupils';'45'='Wrong/Reality/Believes/Culture/Focus/Media';'46'='Space/Launch/Survive';'47'='Journalism/Press/Newspaper';'48'='Commission/Committee/Report/Investigation';'49'='Coal/Miners/Mine/Pit/Colliery/Explosion';'50'='Million//Billion/Investor/Fund/Equity/Merger/Hedge';
  '51'='Army/Military/War/Troops';'52'='Train/Railway/Accident/Passenger';'53'='Famine/Aid/Relief/Refugees';'54'='Industry/Production/Steel/Manufacturing/Orders';'55'='Church/Catholic/Bishop/Pope'; '56'='President/political/government/country/power';'57'='Health/Hospital/Care/Patients/Doctors/Nurses';'58'='Cabinet/Minister/Resignation/Dissolution/Deputy/Trust';'59'='Football/Club/Fans'; '60'='Power/Energy/Gas/Nuclear/Electricity';
  '61'='Fever/Districts/Precautions/Cases';'62'='Work/Job/Staff/Employee/Employment/Unemployment/Redundancies';'63'='Flood/Water/Dam/River/Rain';'64'='Germany/German/Chancellor';'65'='Sea/Island/Rig/Fish/Ocean';'66'='Britain/Commonwealth/Trade/Agreement/Beef';'67'='Filler words';'68'='Israel/Prime Minister/Meetings/Talks';'69'='Children/Family/Parents/Stress';'70'='Sugar/Expenditure/Tax/Revenue';
  '71'='ISIS/Iraq/Iran/Syria/Terrorism';'72'='University/Training/Skills/Technology';'73'='Officers/Commanders/Memorial';'74'='Transport/Road/Rail/Traffic/Bus';'75'='Business/Company/Insurers/Firm';'76'='Weakening Terms';'77'='News/TV/Radio/Music';'78'='Austria/Bulgaria/Russia/Emperor';'79'='Currency/Inflation/Exchange Rate/Yen/Dollar/Stirling';'80'='Cotton/Meeting/Conference/Committee/Federation';
  '81'='Title2';'82'='Gallery/Art/Exhibition/Theatre/Festival';'83'='Air/Aircraft/Flight/Aviation';'84'='Ambulance/Spokesperson';'85'='Greece/Turkey';'86'='Bridge/Roof/Collapse';'87'='Banks/Financial/Credit/Debt/Mortgage';'88'='Australia/New Zealand/Canada/New York';'89'='Economy/Economic Policy/Inflation/Unemployment/Successive';'90'='Challenge';
  '91'='Tax/Budget/Spending/Pension/Government/Cuts/Deficit';'92'='Damages/Court/Justice/Legal/Law/Lawyer';'93'='Word fragments';'94'='Christmas/Holiday/Tourists/Sales/Wine';'95'='USA/Washington/America/President';'96'='Coffee/Property/Market/Buyers/Price';'97'='Evidence/Witness/Coroner/Jury/Inquiry/Evidence';'98'='Recession/Gorwth/Economy/Consumer/Sector';'99'='Ireland/Irish/Unionist';'100'='Tin/Contract/Creditor/Copper/Exchange/Metal';
  '101'='University/Professor/Students/College/Oxbridge';'102'='Nation/Power/Country/People/Great/Empire';'103'='Coronavirus/Bird/Pandemic';'104'='Animal/Cattle/Mouth/Famers/Food/Ban';'105'='Building/Housing/Homes/Homeless/Charities';'106'='Bank/Gold/Loan/Currency/Silver/Circulation';'107'='Korea/Japan/Brazil';'108'='Russia/Soviet Union/Polish/Communist';'109'='King/Queen/Royal/Majesty';'110'='Police/Prison/Crime/Murder';
  '111'='Fire/Explosion/Flame/Burning/Smoke';'112'='London/Local/Council/Borough';'113'='India/Delhi/Colony/Province/Governor';'114'='Car Industry/Motor/Sales/Company';'115'='Food/Farmers/Agriculture/Wheat/Grain/Crop';'116'='France/Portugal/Spain/Belgium';'117'='Labour/Conservative/Tories/Cameron/Thatcher/Johnson/Blair';'118'='War/Peace/Treaty/NATO/Allies';'119'='Lordship/Gentleman/Majesty';'120'='Crowd/Mob/Hunt/Castle/Soldiers'")

  # Coarse thematic AREA label per topic (same mapping as for wavedata).
  # FIX: topic 103 was labelled 'Epidemics' while topics 16/25/61 use
  # 'Epidemic'; unified to 'Epidemic' so the category is not split in two.
  wavedata.vol50$AREA <- Recode(wavedata.vol50$topic,"'1'='Geopolitical';'2'='Society';'3'='Society';'4'='Economy';'5'='Functional';'6'='Government';'7'='Leisure';'8'='Technology';'9'='Misc';'10'='Functional';
  '11'='Geopolitical';'12'='Functional';'13'='Labor';'14'='Family';'15'='Economy';'16'='Epidemic';'17'='Misc';'18'='Military';'19'='Geopolitical';'20'='Economy';
  '21'='Leisure';'22'='Government';'23'='Economy';'24'='Disaster';'25'='Epidemic';'26'='Military';'27'='Leisure';'28'='Geopolitical';'29'='Energy';'30'='Health';
  '31'='Geopolitical';'32'='Functional';'33'='Transport';'34'='Economy';'35'='Functional';'36'='Science';'37'='Functional';'38'='Misc';'39'='Economy';'40'='Government';
  '41'='Transport';'42'='Disaster';'43'='Government';'44'='Education';'45'='Functional';'46'='Technology';'47'='Public';'48'='Justice';'49'='Disaster';'50'='Economy';
  '51'='Military';'52'='Disaster';'53'='Disaster';'54'='Economy';'55'='Society';'56'='Government';'57'='Health';'58'='Government';'59'='Leisure';'60'='Energy';
  '61'='Epidemic';'62'='Labor';'63'='Disaster';'64'='Geopolitical';'65'='Geopolitical';'66'='Economy';'67'='Functional';'68'='Geopolitical';'69'='Family';'70'='Economy';
  '71'='Geopolitical';'72'='Education';'73'='Military';'74'='Transport';'75'='Economy';'76'='Functional';'77'='Public';'78'='Geopolitical';'79'='Economy';'80'='Economy';
  '81'='Functional';'82'='Leisure';'83'='Transport';'84'='Disaster';'85'='Geopolitical';'86'='Disaster';'87'='Economy';'88'='Geopolitical';'89'='Economy';'90'='Functional';
  '91'='Economy';'92'='Justice';'93'='Functional';'94'='Leisure';'95'='Geopolitical';'96'='Economy';'97'='Justice';'98'='Economy';'99'='Geopolitical';'100'='Economy';
  '101'='Education';'102'='Government';'103'='Epidemic';'104'='Economy';'105'='Welfare';'106'='Economy';'107'='Geopolitical';'108'='Geopolitical';'109'='Government';'110'='Justice';
  '111'='Disaster';'112'='Geopolitical';'113'='Geopolitical';'114'='Economy';'115'='Economy';'116'='Geopolitical';'117'='Government';'118'='Military';'119'='Functional';'120'='Public'")

  # Fine-grained thematic classification (same mapping as for wavedata).
  # FIX: topic 103 was labelled 'Epidemics' while topics 16/25/61 use
  # 'Epidemic'; unified to 'Epidemic' for consistency.
  wavedata.vol50$AREA2 <- Recode(wavedata.vol50$topic,"'1'='Geopolitical';'2'='Society';'3'='Society';'4'='Economy/Market';'5'='Functional';'6'='Government/Legislative';'7'='Leisure/Literature';'8'='Technology/Communication';'9'='Misc';'10'='Functional';
  '11'='Location/Africa';'12'='Functional';'13'='Labor';'14'='Family';'15'='Economy/Numbers';'16'='Epidemic';'17'='Misc';'18'='Military';'19'='Location/Europe';'20'='Economy/Market';
  '21'='Leisure/Sports';'22'='Government/Executive';'23'='Economy/Business';'24'='Disaster';'25'='Epidemic';'26'='Military';'27'='Leisure/Drama';'28'='Location/Asia';'29'='Energy';'30'='Health';
  '31'='Location/MiddleEast';'32'='Functional/Time';'33'='Transport';'34'='Economy/Secondary';'35'='Functional';'36'='Science';'37'='Functional/Address';'38'='Misc';'39'='Economy/Primary';'40'='Government/Legislative';
  '41'='Transport';'42'='Disaster';'43'='Government/Executive';'44'='Education';'45'='Functional';'46'='Technology';'47'='Public';'48'='Justice/Investigation';'49'='Disaster';'50'='Economy/Investor';
  '51'='Military';'52'='Disaster';'53'='Disaster';'54'='Economy/Secondary';'55'='Society';'56'='Government/Executive';'57'='Health';'58'='Government/Executive';'59'='Leisure';'60'='Energy';
  '61'='Epidemic';'62'='Labor';'63'='Disaster';'64'='Location/Europe';'65'='Location/Oceans';'66'='Economy/Primary';'67'='Functional';'68'='Location/MiddleEast';'69'='Family';'70'='Economy/Primary';
  '71'='Location/NearEast';'72'='Education';'73'='Military';'74'='Transport';'75'='Economy/Financial';'76'='Functional';'77'='Public';'78'='Location/Europe';'79'='Economy/Currency';'80'='Economy/Trade';
  '81'='Functional';'82'='Leisure';'83'='Transport';'84'='Disaster';'85'='Location/Europe';'86'='Disaster';'87'='Economy/Financial';'88'='Location/Oceania';'89'='Economy/Policy';'90'='Functional';
  '91'='Economy/Policy';'92'='Justice/Case';'93'='Functional';'94'='Leisure';'95'='Location/America';'96'='Economy/Market';'97'='Justice/Case';'98'='Economy/Macro';'99'='Location/Europe';'100'='Economy/Primary';
  '101'='Education';'102'='Government/Executive';'103'='Epidemic';'104'='Economy/Primary';'105'='Welfare';'106'='Economy/Currency';'107'='Location/Asia';'108'='Location/Europe';'109'='Government/Royals';'110'='Justice/Case';
  '111'='Disaster';'112'='Location/UK';'113'='Location/Asia';'114'='Economy/Secondary';'115'='Economy/Primary';'116'='Location/Europe';'117'='Government/Executive';'118'='Military';'119'='Functional';'120'='Society'")




  # Cross-tab of fine-grained area by decade (printed for inspection).
  # FIX: removed a stray trailing "*" that chained this expression onto the
  # assignment below, turning it into an invalid assignment target at
  # evaluation time.
  table(wavedata.vol50$AREA2,wavedata.vol50$decade)
  # Total crisis volume per decade x AREA2 cell.
  x <- tapply(wavedata.vol50$volume,interaction(wavedata.vol50$decade,wavedata.vol50$AREA2),FUN="sum")
  # NOTE(review): ncol=46 / nrow=24 hard-code the observed number of AREA2
  # levels and decades; they break if either count changes. Confirm against
  # length(table(wavedata.vol50$AREA2)).
  x2 <- matrix(x,ncol=46,nrow=24)
  rownames(x2) <- seq(1790,2020,10)
  colnames(x2) <- names(table(wavedata.vol50$AREA2))
  x2 <- replace(x2,is.na(x2),0)

#  100*round(prop.table(x2,1),2)

  # Same aggregation for the coarse AREA classification.
  # NOTE(review): ncol=20 likewise hard-codes the number of AREA levels.
  y <- tapply(wavedata.vol50$volume,interaction(wavedata.vol50$decade,wavedata.vol50$AREA),FUN="sum")
  y2 <- matrix(y,ncol=20,nrow=24)
  rownames(y2) <- seq(1790,2020,10)
  colnames(y2) <- names(table(wavedata.vol50$AREA))
  y2 <- replace(y2,is.na(y2),0)

#  100*round(prop.table(y2,1),2)


  # Economy subcategories of the fine-grained AREA2 classification.
  arealist <- c('Economy/Business','Economy/Currency','Economy/Financial','Economy/Investor','Economy/Macro','Economy/Market',
      'Economy/Numbers','Economy/Policy','Economy/Primary','Economy/Secondary','Economy/Trade')

  # Counts of economic crises per decade and AREA2 subcategory, in long
  # format (melt() is from reshape2).
  ecoarea <- with(subset(wavedata.vol50,AREA=="Economy"),t(table(AREA2,decade)))
  ea_long <- melt(ecoarea)
  names(ea_long) <- c("Decade", "AREA", "Count")

  # Total economic crises per decade, used to turn counts into shares.
  crisis_by_decade <- aggregate(ea_long$Count,by=list(ea_long$Decade),FUN="sum")

  ea_long$Total <- crisis_by_decade[match(ea_long$Decade,crisis_by_decade$Group.1),"x"]
  ea_long$Share <- round(100*(ea_long$Count/ea_long$Total),1)
  ea_long$AREA2 <- factor(ea_long$AREA,ordered=TRUE)

  # ggplot(subset(ea_long,AREA%in%arealist),aes(x=Decade,y=Count,group=AREA2,fill=AREA2))+geom_area()
  # Share of each economic subcategory per decade, stacked columns.
  economic_crisis_categories <- ggplot(subset(ea_long,AREA%in%arealist),aes(x=Decade,y=Share,group=AREA2,fill=AREA2))+geom_col()+theme_light()+scale_x_continuous(breaks=seq(1800,2010,10))

  # x <- matrix(with(subset(wavedata.vol50,AREA=="Economy"),tapply(volume,interaction(AREA2,decade),mean,na.rm=TRUE)),nrow=39)
  # colnames(x) <- names(table(wavedata.vol50$decade))
  # rownames(x) <- names(table(wavedata.vol50$TOPIC))

  # Coarse AREA crisis counts per decade, reshaped to long format.
  crisisarea <- t(table(wavedata.vol50$AREA,wavedata.vol50$decade))
  ca_long <- melt(crisisarea)
  names(ca_long) <- c("Decade", "AREA", "Count")

  # Economic vs non-economic crisis counts per decade, indexed to each
  # series' maximum (100 = the largest observed decade count per type).
  binary_area <- t(table(wavedata.vol50$AREA=="Economy",wavedata.vol50$decade))
  ba_long <- melt(binary_area)
  names(ba_long) <- c("Decade", "AREA", "Count")
  # FIX: na.tm was a typo for na.rm; the misspelled name did not match
  # max()'s na.rm argument and was instead absorbed into max(...)'s values
  # as TRUE (= 1), which could distort a group maximum of 0.
  # NOTE(review): each=24 hard-codes the number of decades -- confirm.
  ba_long$max <- rep(c(tapply(ba_long$Count,ba_long$AREA,max,na.rm=TRUE)),each=24)
  ba_long$Index <- round(100*ba_long$Count/ba_long$max,1)
  ba_long$CrisisType <- factor(Recode(ba_long$AREA,"'TRUE'='Economic crisis';'FALSE'='Non-economic crisis'"),ordered=TRUE)

  # Total number of crises per decade (all areas combined).
  cbd2 <- aggregate(ca_long$Count,by=list(ca_long$Decade),FUN="sum")

  # Frequency table of crisis areas, printed in descending order.
  table(wavedata.vol50$AREA)[order(table(wavedata.vol50$AREA),decreasing=TRUE)]

  ca_long$Total <- cbd2[match(ca_long$Decade,cbd2$Group.1),"x"]
  ca_long$Share <- round(100*(ca_long$Count/ca_long$Total),1)
  # Ordered factor for plotting, roughly by overall frequency.
  # FIX: the level "<Functional>" never matched the actual AREA value
  # "Functional", silently turning all Functional rows into NA in AREA4;
  # also appended the remaining observed labels ("Misc", "Epidemics") so no
  # area is dropped from plots.
  ca_long$AREA4 <- factor(ca_long$AREA,ordered=TRUE,levels=c("Government", "Economy", "Functional", "Geopolitical", "Epidemic", "Disaster", "Military", "Transport", "Justice", "Leisure", "Energy", "Labor", "Health", "Society", "Education", "Public", "Family", "Science", "Welfare", "Technology", "Misc", "Epidemics"))


  # Flag each wave as economic vs non-economic.
  wavedata.vol50$ECOCRISIS <- ifelse(wavedata.vol50$AREA=="Economy", "Economic crisis", "Other crisis type")

  # Yearly volume of economic-crisis coverage, 1788-2020 (0 where none).
  EconomicCrisisVolume <- data.frame(year=seq(1788,2020,1))
  # NOTE(review): drop=FALSE here is a third argument to with(), not to
  # aggregate(), and is silently ignored.
  count <- with(subset(wavedata.vol50,ECOCRISIS=="Economic crisis"),aggregate(volume,by=list(year),FUN="sum"),drop=FALSE)
  EconomicCrisisVolume$Count <- count[match(EconomicCrisisVolume$year,as.numeric(count$Group.1)),"x"]
  EconomicCrisisVolume$Count <- replace(EconomicCrisisVolume$Count,is.na(EconomicCrisisVolume$Count),0)
  # 3-year moving average and a 21-year "expected" baseline (zoo::rollmean).
  # NOTE(review): the edge-fill values c(1,NA,93) look like data-derived
  # magic numbers -- confirm their origin.
  EconomicCrisisVolume$SmoothCount <- rollmean(EconomicCrisisVolume$Count,3,align="center",fill=c(0,NA,0)) 
  EconomicCrisisVolume$Expected <- rollmean(EconomicCrisisVolume$Count,21,align="center",fill=c(1,NA,93)) 

  # Yearly counts of economic vs other crises, aligned to the year grid.
  x <- with(wavedata.vol50,aggregate((ECOCRISIS=="Economic crisis"),by=list(year),FUN="sum"))
  y <- with(wavedata.vol50,aggregate((ECOCRISIS=="Other crisis type"),by=list(year),FUN="sum"))
  EconomicCrisisVolume$ECOCRISIS <- x[match(EconomicCrisisVolume$year,x$Group.1),"x"]
  EconomicCrisisVolume$OTHERCRISIS <- y[match(EconomicCrisisVolume$year,y$Group.1),"x"]
  EconomicCrisisVolume$ECOCRISIS <- replace(EconomicCrisisVolume$ECOCRISIS,is.na(EconomicCrisisVolume$ECOCRISIS),0)
  EconomicCrisisVolume$OTHERCRISIS <- replace(EconomicCrisisVolume$OTHERCRISIS,is.na(EconomicCrisisVolume$OTHERCRISIS),0)

#  summary(lm(OTHERCRISIS~1+ECOCRISIS+SmoothCount+Expected+year,data=EconomicCrisisVolume))
#  summary(lm(ECOCRISIS~1+OTHERCRISIS+SmoothCount+Expected+year,data=EconomicCrisisVolume))

#  summary(lm(OTHERCRISIS~1+ECOCRISIS*(year>1950),data=EconomicCrisisVolume))
#  summary(lm(ECOCRISIS~1+OTHERCRISIS*(year>1950),data=EconomicCrisisVolume))

#  ggplot(EconomicCrisisVolume,aes(x=year,y=OTHERCRISIS,color=ECOCRISIS,size=ECOCRISIS))+geom_point()+theme_light()+
#    scale_x_continuous(breaks=seq(1780,2020,10))

#  ggplot(EconomicCrisisVolume,aes(x=year,y=OTHERCRISIS,color=ECOCRISIS,size=Count))+geom_point()+theme_light()+
#    scale_x_continuous(breaks=seq(1780,2020,10))

  #wavedata.vol50$EcoVol <- EconomicCrisisVolume[match(wavedata.vol50$year,EconomicCrisisVolume$Group.1),"x"]
  #wavedata.vol50$EcoVolSmooth <- EconomicCrisisVolume[match(wavedata.vol50$year,EconomicCrisisVolume$Group.1),"SmoothCount"]
  #wavedata.vol50$EcoVolPlus <- EconomicCrisisVolume[match(wavedata.vol50$year,EconomicCrisisVolume$Group.1),"x"]-EconomicCrisisVolume[match(wavedata.vol50$year,EconomicCrisisVolume$Group.1),"Expected"]
  #wavedata.vol50$EcoVolSmoothPlus <- EconomicCrisisVolume[match(wavedata.vol50$year,EconomicCrisisVolume$Group.1),"SmoothCount"]-EconomicCrisisVolume[match(wavedata.vol50$year,EconomicCrisisVolume$Group.1),"Expected"]


  #ggplot(ca_long,aes(x=Decade,y=Count,group=AREA4,fill=AREA4))+geom_area()+theme_light()
  #ggplot(ca_long,aes(x=Decade,y=Share,group=AREA4,fill=AREA4))+geom_col()+theme_light()+scale_fill_manual(values=c("#000000", "red", "#111111", "#222222", "#333333", "#444444", "#555555", "#666666",
  # "#777777", "#888888", "#999999", "#aaaaaa", "#bbbbbb", "#cccccc", "#dddddd", "#eeeeee", "#ffffff", "#121212", "#232323", "#343434", "#454545", "#565656", "#676767"))


# Raw count of economic crises per decade.
  gg_freq_eco_abs <- ggplot(subset(ca_long,AREA4=="Economy"),aes(x=Decade,y=Count))+geom_point()+geom_smooth()+scale_x_continuous(breaks=seq(1800,2020,10))+theme_light()+ylab("Absolute number of economic crises")+xlab("Decade")

  # Relative frequency index (100 = maximum observed value per crisis type).
  gg_freq_eco_rel <- ggplot(ba_long,aes(x=Decade,y=Index,color=CrisisType,group=CrisisType))+geom_point()+geom_smooth(se=FALSE)+    ylim(0,100)+scale_x_continuous(breaks=seq(1800,2020,10))+scale_color_viridis_d(option="plasma",begin=.0,end=.5)+ylab("Frequency of crisis index\n(100=maximum observed value for the respective crisis type)")+theme_light()

  # Exploratory: coverage volume over time, economy vs. all other areas.
  # FIX: this identical plot statement previously appeared twice in a row;
  # the duplicate has been removed.
  ggplot(wavedata.vol50,aes(color=(AREA=="Economy"),x=year,y=volume))+geom_point(aes(size=volume))+geom_smooth()

  # Same comparison, faceted by economy vs. non-economy.
  ggplot(wavedata.vol50,aes(x=YEAR,y=volume,color=(AREA=="Economy"),fill=(AREA=="Economy")))+geom_point(aes(size=volume))+geom_smooth()+facet_grid(AREA=="Economy"~.)+theme_light()+ylab("Volume of coverage about a crisis event")+scale_x_continuous(breaks=seq(1800,2020,10))+scale_color_viridis_d(option="plasma",begin=.0,end=.5)+scale_fill_viridis_d(option="plasma",begin=.0,end=.5)

  # Publication figures: per-event volume and duration by crisis type.
  gg_crisis_volume <- ggplot(wavedata.vol50,aes(x=as.numeric(year),y=volume,group=ECOCRISIS,color=ECOCRISIS,fill=ECOCRISIS))+geom_jitter(aes(size=volume))+geom_smooth()+theme_light()+ylab("Volume of coverage about a crisis event")+scale_x_continuous(breaks=seq(1800,2020,10))+scale_color_viridis_d(option="plasma",begin=.0,end=.5)+scale_fill_viridis_d(option="plasma",begin=.0,end=.5)+xlab("Year")

  gg_crisis_duration <- ggplot(wavedata.vol50,aes(x=as.numeric(year),y=duration,group=ECOCRISIS,color=ECOCRISIS,fill=ECOCRISIS))+geom_jitter(aes(size=duration),alpha=.2)+geom_smooth()+theme_light()+ylab("Duration of coverage about a crisis event")+scale_x_continuous(breaks=seq(1800,2000,10))+scale_color_viridis_d(option="plasma",begin=.0,end=.5)+scale_fill_viridis_d(option="plasma",begin=.0,end=.5)+xlab("Year")

  # Duration over time for economic crises only.
  ggplot(subset(wavedata.vol50,ECOCRISIS=="Economic crisis"),aes(x=as.numeric(year),y=duration))+geom_jitter(aes(size=duration))+geom_smooth()+theme_light()+ylab("Duration of coverage about a crisis event")+scale_x_continuous(breaks=seq(1800,2000,10))+scale_color_viridis_d(option="plasma",begin=.0,end=.5)+scale_fill_viridis_d(option="plasma",begin=.0,end=.5)+xlab("Year")

  # Linear time trends of duration and volume, for all crises and for the
  # economy subset, with fitted values at six reference years.

  # Duration trend, all crises.
  xlm1 <- lm(duration~I(as.numeric(year)),data=wavedata.vol50)
  summary(xlm1)
  predict(xlm1,newdata=data.frame(year=c(1800,1850,1900,1950,2000,2020)))

  # Duration trend, economic crises only.
  xlm2 <- lm(duration~I(as.numeric(year)),data=subset(wavedata.vol50,AREA3=="ECONOMY"))
  summary(xlm2)
  predict(xlm2,newdata=data.frame(year=c(1800,1850,1900,1950,2000,2020)))

  # Volume trend, all crises.
  xlm3 <- lm(volume~I(as.numeric(year)),data=wavedata.vol50)
  summary(xlm3)
  predict(xlm3,newdata=data.frame(year=c(1800,1850,1900,1950,2000,2020)))

  # Volume trend, economic crises only.
  xlm4 <- lm(volume~I(as.numeric(year)),data=subset(wavedata.vol50,AREA3=="ECONOMY"))
  summary(xlm4)
  predict(xlm4,newdata=data.frame(year=c(1800,1850,1900,1950,2000,2020)))


  # Per-decade share of economic crises: tapply sums the logical (count of
  # economy events), table() gives the per-decade denominator.
  eco_share <- data.frame(tapply(wavedata.vol50$AREA=="Economy",wavedata.vol50$decade,FUN="sum")/table(wavedata.vol50$decade))

  # Share of economic crisis among all crises
  ggplot(eco_share,aes(x=as.numeric(Var1),y=Freq))+geom_point()+ylim(0,1)+geom_smooth(method="lm")

  # Cross-check of the same share via the AREA3 coding.
  table(subset(wavedata.vol50,AREA3=="ECONOMY")$decade)/table(wavedata.vol50$decade)


  # Export one text file per crisis wave ("et0000<i>-1.txt") containing the
  # wave's first associated article (metadata, full text, keyword-in-context).
  # Each piece is built only if the article id is found in textfiles.1.
  for (i in 1:length(wave.keywords))
    {
      # p1: end date plus date/newspaper/headline of the first matched article.
      p1 <- if (any(textfiles.1$id==wave.keywords[[i]]$texts$id[1],na.rm=T))    {(data.frame(start=wave.keywords[[i]]$enddate,end=wave.keywords[[i]]$enddate,textfiles.1[which(textfiles.1$id==wave.keywords[[i]]$texts$id[1]),c("date", "newspaper", "headline")]))}
      # p2: the article's full text.
      p2 <- if (any(textfiles.1$id==wave.keywords[[i]]$texts$id[1],na.rm=T)) {(paste("\n\n",textfiles.1[which(textfiles.1$id==wave.keywords[[i]]$texts$id[1]),c("text")],"\n\n"))}

      # p3: keyword-in-context hits for the article (window of 15 tokens).
      p3 <- if (any(textfiles.1$id==wave.keywords[[i]]$texts$id[1],na.rm=T)) {print(kwic(corpus_subset(cp,id==wave.keywords[[i]]$texts$id[1]),pattern=myKeys,window=15))}
        
      # NOTE(review): columns 2-6 of the kwic object are from/to/pre/keyword/
      # post -- confirm these positional indices pick the intended fields.
      pp3 <- paste(p3[,2],p3[,3],p3[,4],p3[,5],p3[,6])
        
      pp <- paste(p1[,1],p1[,2],p1[,3],p1[,4],p1[,5],"\n\n",p2,"\n\n",paste(pp3,collapse=" \n "),"\n\n\n\n\n")

      # Zero-pad the wave index to five digits for stable file sorting.
      filename1 <- paste0("et",ifelse(i>999,"0",ifelse(i>99,"00",ifelse(i>9,"000", "0000"))),i,"-1", ".txt")
      write.csv(pp,file=filename1)
    }

  ggplot(wavedata.vol50,aes(ymin=as.numeric(AREA),ymax=as.numeric(AREA)+intensity/5,xmin=as.numeric(year),xmax=as.numeric(year)+10*duration/365,fill=AREA))+geom_rect(alpha=.5)+theme_light()+geom_text(hjust=0,aes(label=AREA,color=AREA,x=1790,y=as.numeric(AREA)+0))+scale_x_continuous(breaks=seq(1780,2020,10))


  # Interactive spot check: print metadata and full text for a single wave w.
  w <- 1
  stri_split_lines(paste(wavedata.vol50[w,c("year")],wavedata.vol50[w,c("TOPIC")],round(wavedata.vol50[w,c("volume")],1),wavedata.vol50[w,c("duration")],wave.keywords[[w]]$startdate,textfiles.1[which(textfiles.1$id==wave.keywords[[w]]$texts$id[w]),"text"]))

  textfiles.1[textfiles.1$id=="CS185787643"]

  # Numeric year and decade columns for aggregation.
  wavedata.vol50$YEAR <- as.numeric(wavedata.vol50$year)
  wavedata.vol50$DECADE <- floor(wavedata.vol50$YEAR/10)*10

  # Events per year, and number of distinct topics per year.
  # NOTE(review): uniq_count is a helper defined elsewhere in this file --
  # presumably length(unique(x)); confirm.
  eventcount <- aggregate(wavedata.vol50$TOPIC,by=list(wavedata.vol50$year),FUN=length)
  unique_eventcount <- aggregate(wavedata.vol50$TOPIC,by=list(wavedata.vol50$year),FUN=uniq_count)
  names(unique_eventcount) <- c("year", "UniqueTopicsCovered")

  # Same counts per decade.
  eventcount_decade <- aggregate(wavedata.vol50$TOPIC,by=list(wavedata.vol50$decade),FUN=length)
  unique_eventcount_decade <- aggregate(wavedata.vol50$TOPIC,by=list(wavedata.vol50$decade),FUN=uniq_count)
  names(unique_eventcount_decade) <- c("decade", "UniqueTopicsCovered")

  # Distinct topical AREAS per decade.
  eventcount_decade_area <- aggregate(wavedata.vol50$AREA,by=list(wavedata.vol50$decade),FUN=length)
  unique_eventcount_decade_area <- aggregate(wavedata.vol50$AREA,by=list(wavedata.vol50$decade),FUN=uniq_count)
  names(unique_eventcount_decade_area) <- c("decade", "UniqueAreasCovered")

  # Breadth of crisis coverage per decade: distinct topics (red line at 120 =
  # presumably the total number of STM topics; confirm) with linear and loess
  # smooths.
  gg_topicspectrum.event.decades <-  ggplot(unique_eventcount_decade,aes(x=decade,y=UniqueTopicsCovered))+geom_point(size=2.5)+geom_smooth(method="lm",fill="#44aa66",color="#44aa66")+geom_smooth(fill="#dd1c77",color="#dd1c77",linetype="longdash")+geom_hline(yintercept=120,color="red")+theme_light()+ylim(0,120)+ylab("Topics covered by crisis events in the decade")+xlab("Decade")

  # Same for distinct topical areas (red line at 39 = presumed maximum).
  gg_areaspectrum.event.decades <-  ggplot(unique_eventcount_decade_area,aes(x=decade,y=UniqueAreasCovered))+geom_point(size=2.5)+geom_smooth(method="lm",fill="#44aa66",color="#44aa66")+geom_smooth(fill="#dd1c77",color="#dd1c77",linetype="longdash")+geom_hline(yintercept=39,color="red")+theme_light()+ylim(0,39)+ylab("Areas covered by crisis events in the decade")+xlab("Decade")

#  ggsave(gg_topicspectrum.event.decades,file="topicspectrum_event_decade.svg",units="cm",width=16,height=12,dpi=1200,scale=1.00)
#  ggsave(gg_areaspectrum.event.decades,file="areaspectrum_event_decade.svg",units="cm",width=16,height=12,dpi=1200,scale=1.00)


  # Total crisis-coverage volume (articles) per year.
  crisisarticles <- aggregate(wavedata.vol50$volume,by=list(wavedata.vol50$year),FUN=sum)

  crisisarticles$year <- as.numeric(crisisarticles$Group.1)
  crisisarticles$volume <- crisisarticles$x

  # Yearly article counts over time, log10 y-scale, linear + loess smooths.
  # FIX: the same plot was previously assigned twice in a row; the first
  # (linear-scale) assignment was dead code and has been removed.
  crisisarticles.time <- ggplot(crisisarticles,aes(y=volume,x=year,label=year,size=(volume)))+geom_point(color="gray",shape=17)+geom_text(color="black")+geom_smooth(linetype="longdash",fill="#44aa66",color="#44aa66",method="lm",na.rm=TRUE)+geom_smooth(fill="#dd1c77",color="#dd1c77",na.rm=TRUE)+xlab("Year")+ylab("Number of articles assigned to crisis events per year")+theme_light()+theme(legend.position="none")+
  scale_y_continuous(trans="log10")

  # Linear trend of yearly article volume.
  crisisarticles.lm <- lm(volume~year,data=crisisarticles)
  # predict(crisisarticles.lm,newdata=list(year=c(1800,1850,1900,1950,2000,2050)))

  # ggsave(crisisarticles.time,file="crisisarticles+time.svg",dpi=1200,unit="cm",width=16,height=12,scale=1.0)

  # Number of crisis events identified per year.
  # FIX: the original summed duration/duration (a vector of ones), which
  # yields NaN/NA whenever duration is 0 or NA; FUN=length counts directly.
  eventcount <- aggregate(wavedata.vol50$duration,by=list(wavedata.vol50$year),FUN=length)

  eventcount$year <- as.numeric(eventcount$Group.1)
  eventcount$count <- eventcount$x

  # Yearly event counts with linear and loess smooths.
  eventcount.time <- ggplot(eventcount,aes(y=count,x=year,label=year,size=count))+geom_point(shape=17,color="gray")+geom_text(color="black")+geom_smooth(linetype="longdash",fill="#44aa66",color="#44aa66",method="lm",na.rm=TRUE)+geom_smooth(fill="#dd1c77",color="#dd1c77",na.rm=TRUE)+xlab("Year")+ylab("Number of crisis events identified")+theme_light()+theme(legend.position="none")

  # Linear trend of yearly event counts.
  eventcount.lm <- lm(count~year,data=eventcount)
  # predict(eventcount.lm,newdata=list(year=c(1800,1850,1900,1950,2000,2050)))

  # summary(lm(x~as.numeric(Group.1),data=x))

  # ggsave(eventcount.time,file="eventcount+time.svg",dpi=1200,unit="cm",width=16,height=12,scale=1.0)

  # Inspect waves with exceptionally high coverage volume.
  subset(wavedata.vol50,volume>100)

  # Density timeline: one rectangle per wave; start/end are stored as days
  # since year 0 and converted to POSIXct via seconds (x*60*60*24); widths
  # stretched by 5x duration for visibility.
  crisisplot <- ggplot(wavedata.vol50,aes(color=topic,fill=topic,xmin=as.POSIXct(start*60*60*24,origin="0000-01-01"),xmax=as.POSIXct((start+duration*5)*60*60*24,origin="0000-01-01"),ymin=as.numeric(topic),ymax=as.numeric(topic)+2*intensity))+
                    geom_rect()+scale_fill_viridis_d(na.value="grey80")+scale_color_viridis_d(na.value="grey80")+theme_light()+ylim(0,120)+ylab("Topic ID")+xlab("Year")+theme(legend.position="none")

    ggsave(crisisplot,file="crisisdensity+time.svg",dpi=1200,unit="cm",width=16,height=12,scale=1.0)

  # Reference years at which the fitted trends are evaluated.
  years50 <- c(1800,1850,1900,1950,2000,2050)

  wavedata.vol50$YEAR <- as.numeric(wavedata.vol50$year)

  # Total newspaper output per year, joined from the thetimes table.
  wavedata.vol50$TotalNewspaperVolume <- thetimes[match(wavedata.vol50$year,thetimes$year),"articles"]

  # Linear time trend for each wave property, with predictions at years50.
  volume.lm <- lm(volume~YEAR,data=wavedata.vol50)
  predict(volume.lm,newdata=list(YEAR=years50))

  duration.lm <- lm(duration~YEAR,data=wavedata.vol50)
  predict(duration.lm,newdata=list(YEAR=years50))

  intensity.lm <- lm(intensity~YEAR,data=wavedata.vol50)
  predict(intensity.lm,newdata=list(YEAR=years50))

  max.intensity.lm <- lm(max.intensity~YEAR,data=wavedata.vol50)
  predict(max.intensity.lm,newdata=list(YEAR=years50))

  variability.lm <- lm(variability~YEAR,data=wavedata.vol50)
  predict(variability.lm,newdata=list(YEAR=years50))

#  summary(lm(duration~I(as.numeric(year)-1788),data=wavedata.vol50))
#  summary(lm(intensity~I(as.numeric(year)-1788),data=wavedata.vol50))
#  summary(lm(max.intensity~I(as.numeric(year)-1788),data=wavedata.vol50))
#  summary(lm(variability~I(as.numeric(year)-1788),data=wavedata.vol50))
#  summary(lm((variability/intensity)~I(as.numeric(year)-1788),data=wavedata.vol50))
#  summary(lm(baseline365~I(as.numeric(year)-1788),data=wavedata.vol50))

  # Trajectory plots of per-wave properties over time (jittered points,
  # linear + loess smooths), each saved as SVG.
  # NOTE(review): the first two plots use wd5 while the later ones use
  # wavedata.vol50 -- both are assigned from wavedata2 elsewhere in this file;
  # confirm they are meant to be identical here.
  intensity.trajectory <- ggplot(wd5,aes(x=as.numeric(year),y=intensity))+geom_jitter(size=0.5)+geom_smooth(linetype="longdash",fill="#44aa66",color="#44aa66",method="lm",na.rm=TRUE)+geom_smooth(fill="#dd1c77",color="#dd1c77")+theme_light()+xlab("Year")+ylab("Average daily intensity of coverage\nduring the identified crisis case")+
  scale_y_continuous(trans="log10")

    ggsave(intensity.trajectory,file="intensity+time.svg",dpi=1200,unit="cm",width=16,height=12,scale=1.0)

  duration.trajectory <- ggplot(wd5,aes(x=as.numeric(year),y=duration))+geom_jitter(size=0.5)+geom_smooth(linetype="longdash",fill="#44aa66",color="#44aa66",method="lm",na.rm=TRUE)+geom_smooth(fill="#dd1c77",color="#dd1c77")+theme_light()+xlab("Year")+ylab("Average duration of above-baseline coverage\nduring the identified crisis case")+ylim(0,160)

    ggsave(duration.trajectory,file="duration+time.svg",dpi=1200,unit="cm",width=16,height=12,scale=1.0)

  # NOTE(review): ylim(0,500) here is overridden by the subsequent
  # scale_y_continuous(trans="log10") -- ggplot keeps only the last y scale.
  volume.trajectory <- ggplot(wavedata.vol50,aes(x=as.numeric(year),y=volume))+geom_jitter(size=0.5)+geom_smooth(linetype="longdash",fill="#44aa66",color="#44aa66",method="lm",na.rm=TRUE)+geom_smooth(fill="#dd1c77",color="#dd1c77")+theme_light()+xlab("Year")+ylab("Total volume of coverage\nduring the identified crisis case")+ylim(0,500)+
  scale_y_continuous(trans="log10")

    ggsave(volume.trajectory,file="volume+time.svg",dpi=1200,unit="cm",width=16,height=12,scale=1.0)

  peak.trajectory <- ggplot(wavedata.vol50,aes(x=as.numeric(year),y=max.intensity))+geom_jitter(size=0.5)+geom_smooth(linetype="longdash",fill="#44aa66",color="#44aa66",method="lm",na.rm=TRUE)+geom_smooth(fill="#dd1c77",color="#dd1c77")+theme_light()+xlab("Year")+ylab("Maximum amount of coverage\nduring the identified crisis case")+
  scale_y_continuous(trans="log10",limits=c(0.1,31))

    ggsave(peak.trajectory,file="peak+time.svg",dpi=1200,unit="cm",width=16,height=12,scale=1.0)

  # Exploratory: variability (relative and absolute) over time, not saved.
  relative.variability.trajectory <- ggplot(wavedata.vol50,aes(x=as.numeric(year),y=variability/intensity))+geom_point()+geom_smooth()+theme_light()

  variability.trajectory <- ggplot(wavedata.vol50,aes(x=as.numeric(year),y=variability))+geom_point()+geom_smooth()+theme_light()

  # save(wavedata,file="wavedata.RData")
  # save(wavedata.vol50,file="wavedata_vol50.RData")
  # save(wave.keywords,file="wave_keywords.RData")
  # save(tss,file="tss_total.RData")


  # Combine the per-chunk text tables (aby) into one data.frame.
  # FIX: the original grew textdb via rbind() inside a loop, recopying the
  # accumulated table on every iteration (quadratic); bind all chunks once.
  textdb <- do.call(rbind,lapply(aby,data.frame))

  # Supplementary FACTIVA texts whose ids occur in the corpus.
  addtextdb <- ftx[ftx$id%in%docvars(cp)$id,]
  addtextdb$date <- str_replace(addtextdb$date,"März", "March")

  # Harmonise the FACTIVA metadata with the main text table's columns.
  addtextdb2 <- data.frame(doc_id=addtextdb$id,text=addtextdb$text,id=addtextdb$id,date=addtextdb$date,newspaper="The Times.",headline=addtextdb$headline,genre=NA,place=NA,publisher=NA,author=addtextdb$author,archive="FACTIVA",type=NA,year=addtextdb$year,day=str_extract(addtextdb$date,"^[:digit:]{1,2}"),month=str_extract(addtextdb$date,"[:alpha:]{3,3}"),Month=NA,time=NA,Time=NA)
  # Month abbreviations are partly German (Mai/Okt/Dez) because the FACTIVA
  # dates are German-locale.  FIX: 'Jun' was previously recoded to 5
  # (duplicating May); June is month 6.
  addtextdb2$Month <- as.numeric(Recode(addtextdb2$month,"'Jan'=1;'Feb'=2;'Mar'=3;'Apr'=4;'Mai'=5;'Jun'=6;'Jul'=7;'Aug'=8;'Sep'=9;'Okt'=10;'Nov'=11;'Dez'=12"))
  addtextdb2$Time <- (paste0(addtextdb2$year,"-",addtextdb2$Month,"-",addtextdb2$day))
  addtextdb2$time <- (paste0(addtextdb2$year,"-",addtextdb2$month,"-",addtextdb2$day))

  # Full text database: archive texts plus FACTIVA additions.
  tdb <- rbind(textdb,addtextdb2)

  # Placeholders for the per-wave text exports (populated further below).
  textdirectory1 <- NA
  textdirectory2 <- NA

  # Build a quanteda corpus from the combined text table and copy every
  # metadata column into document variables.
  cpa <- corpus(x=as.character(tdb$text))

  docvars(cpa,"doc_id") <- tdb[,c("doc_id")]
  docvars(cpa,"id") <- tdb[,c("id")]
  docvars(cpa,"date") <- tdb[,c("date")]
  docvars(cpa,"newspaper") <- tdb[,c("newspaper")]
  docvars(cpa,"headline") <- tdb[,c("headline")]
  docvars(cpa,"genre") <- tdb[,c("genre")]
  docvars(cpa,"place") <- tdb[,c("place")]
  docvars(cpa,"publisher") <- tdb[,c("publisher")]
  docvars(cpa,"author") <- tdb[,c("author")]
  docvars(cpa,"archive") <- tdb[,c("archive")]
  docvars(cpa,"type") <- tdb[,c("type")]
  docvars(cpa,"year") <- tdb[,c("year")]
  docvars(cpa,"day") <- tdb[,c("day")]
  docvars(cpa,"month") <- tdb[,c("month")]
  docvars(cpa,"time") <- tdb[,c("time")]
  docvars(cpa,"Time") <- tdb[,c("Time")]
  docvars(cpa,"Month") <- tdb[,c("Month")]

  # Restrict the corpus to the documents retained in the topic model, in the
  # model's document order.
  # FIX: the same pair of statements previously ran twice -- first against cp,
  # then immediately overwritten against cpa; only the cpa version had effect,
  # so the dead first pair has been removed.
  cleaner <- match(out.selected3$meta$id,docvars(cpa)$id)
  cpx <- cpa[cleaner]

######################
######################
# GET A STRATIFIED RANDOM SAMPLE OF TEXTS

  # Partition the aligned corpus into nine publication-era strata.
  texts_1788_1825 <- corpus_subset(cpx,year>1787 & year<1826)
  texts_1826_1850 <- corpus_subset(cpx,year>1825 & year<1851)
  texts_1851_1875 <- corpus_subset(cpx,year>1850 & year<1876)
  texts_1876_1900 <- corpus_subset(cpx,year>1875 & year<1901)
  texts_1901_1925 <- corpus_subset(cpx,year>1900 & year<1926)
  texts_1926_1950 <- corpus_subset(cpx,year>1925 & year<1951)
  texts_1951_1975 <- corpus_subset(cpx,year>1950 & year<1976)
  texts_1976_2000 <- corpus_subset(cpx,year>1975 & year<2001)
  texts_2001_2020 <- corpus_subset(cpx,year>2000 & year<2026)

  # Documents per stratum (counts in comments are from the original run).
  ndoc_1788_1825 <- length(texts_1788_1825) # 1347
  ndoc_1826_1850 <- length(texts_1826_1850) # 1305 
  ndoc_1851_1875 <- length(texts_1851_1875) # 2801
  ndoc_1876_1900 <- length(texts_1876_1900) # 7539
  ndoc_1901_1925 <- length(texts_1901_1925) # 10398
  ndoc_1926_1950 <- length(texts_1926_1950) # 11370
  ndoc_1951_1975 <- length(texts_1951_1975) # 19364
  ndoc_1976_2000 <- length(texts_1976_2000) # 34092
  ndoc_2001_2020 <- length(texts_2001_2020) # 42236

  # Documents drawn per stratum.
  sample_size <- 10

  # Simple random sample of document indices within each stratum.
  # FIX: indices were previously drawn with round(runif(...)), which samples
  # WITH replacement, so a stratum's "sample" could contain the same document
  # more than once; sample.int() draws without replacement.
  sel_1788_1825 <- sample.int(ndoc_1788_1825,sample_size)
  sel_1826_1850 <- sample.int(ndoc_1826_1850,sample_size)
  sel_1851_1875 <- sample.int(ndoc_1851_1875,sample_size)
  sel_1876_1900 <- sample.int(ndoc_1876_1900,sample_size)
  sel_1901_1925 <- sample.int(ndoc_1901_1925,sample_size)
  sel_1926_1950 <- sample.int(ndoc_1926_1950,sample_size)
  sel_1951_1975 <- sample.int(ndoc_1951_1975,sample_size)
  sel_1976_2000 <- sample.int(ndoc_1976_2000,sample_size)
  sel_2001_2020 <- sample.int(ndoc_2001_2020,sample_size)

  # One corpus slice of sampled documents per stratum.
  textout <- list()

  textout[[1]] <- texts_1788_1825[sel_1788_1825]
  textout[[2]] <- texts_1826_1850[sel_1826_1850]
  textout[[3]] <- texts_1851_1875[sel_1851_1875]
  textout[[4]] <- texts_1876_1900[sel_1876_1900]
  textout[[5]] <- texts_1901_1925[sel_1901_1925]
  textout[[6]] <- texts_1926_1950[sel_1926_1950]
  textout[[7]] <- texts_1951_1975[sel_1951_1975]
  textout[[8]] <- texts_1976_2000[sel_1976_2000]
  textout[[9]] <- texts_2001_2020[sel_2001_2020]


  # Export each stratified-sample document as "srs-<era>-<n>.txt" with its
  # dominant STM topic, topic keywords, metadata, full text, and KWIC lines.
  for (i in 1:9)
    {
      for (w in 1:sample_size)
        {
          # Row of this document in the STM document-topic matrix.
          position <- match(docvars(textout[[i]][w])$id,docvars(cpx)$id)
          # Topic with the highest share for this document.
          topicno <- which(STMr$theta[position,]==max(STMr$theta[position,]))
          # FIX: the KWIC line pasted $pre twice; the right-hand context is
          # $post (cf. the wave-export loop below, which uses $post).  kwic()
          # is also computed once instead of three times.
          kw <- kwic(textout[[i]][w],myKeys)
          text1 <-  paste("ID=:",docvars(textout[[i]][w])$id,"\n",
                    "Share of topic:",round(STMr$theta[position,topicno],3),"\n",
                    "Keyword list:",paste(summary(STMr)$score[topicno,],collapse=" "),"\n",
                    "Newspaper:",docvars(textout[[i]][w])$newspaper,"\n",
                    "Publication date:",docvars(textout[[i]][w])$date,"\n",
                    "Headline:",docvars(textout[[i]][w])$headline,"\n",           
                    "Text:",paste(textout[[i]][w]),"\n",    
                    "KWIC:",paste(kw$pre,str_to_upper(kw$keyword),kw$post,"\n",collapse=" "))
          filename1 <- paste0("srs", "-",i,"-",w,".txt")
          write.csv(text1,file=filename1)
            print(paste0(i,"/",9))
            flush.console()
            print(paste0(w,"/",sample_size))
            flush.console()
        }
    }


######################
######################
# GET A RANDOM SAMPLE OF TEXTS

  # Documents to draw for the unstratified random sample.
  sample_size <- 100

  # FIX: indices were previously drawn from 1..length(cp) but used to index
  # cpx, the shorter model-aligned corpus, so out-of-range draws produced
  # missing documents; draw directly from cpx, without replacement.
  sel_total <- sample.int(length(cpx),sample_size)

  textout.total <- cpx[sel_total]

  # Export each sampled document as "rs-<n>.txt" (same layout as the
  # stratified export above).
  for (w in 1:length(sel_total))
    {
      position <- match(docvars(textout.total[w])$id,docvars(cpx)$id)
      topicno <- which(STMr$theta[position,]==max(STMr$theta[position,]))
      # FIX: the KWIC line pasted $pre twice; the right-hand context is $post.
      kw <- kwic(textout.total[w],myKeys)
      text1 <-  paste("ID=:",docvars(textout.total[w])$id,"\n",
                "Share of topic:",round(STMr$theta[position,topicno],3),"\n",
                "Keyword list:",paste(summary(STMr)$score[topicno,],collapse=" "),"\n",
                "Newspaper:",docvars(textout.total[w])$newspaper,"\n",
                "Publication date:",docvars(textout.total[w])$date,"\n",
                "Headline:",docvars(textout.total[w])$headline,"\n",              
                "Text:",paste(textout.total[w]),"\n",   
                "KWIC:",paste(kw$pre,str_to_upper(kw$keyword),kw$post,"\n",collapse=" "))
      filename1 <- paste0("rs", "-",w,".txt")
      write.csv(text1,file=filename1)
        print(paste0(w,"/",sample_size))
        flush.console()
    }


######################
######################
# GET EXAMPLE TEXTS FOR TOPICS

  # Per-document topic proportions from the STM model (columns X1..X120).
  issueprobs <- data.frame(STMr$theta)

  # Collapse the individual topics into substantive topic areas by summing
  # the proportions of the manually assigned topic columns.  The trailing
  # comments record the manual validation of each assignment.
  issueprobs$Disaster <- rowSums(issueprobs[,c("X24", "X42", "X49", "X52", "X53", "X63", "X84", "X86", "X111")]) # suitable
  issueprobs$Economy <- rowSums(issueprobs[,c("X4", "X15", "X20", "X23", "X34", "X39", "X50", "X54", "X66", "X70", "X75", "X79", "X80", "X87", "X89", "X91", "X96", "X98", "X100", "X104", "X106", "X114", "X115")]) # suitable
  issueprobs$Education <-  rowSums(issueprobs[,c("X44", "X72", "X101")]) # suitable
  issueprobs$Energy <- rowSums(issueprobs[,c("X29", "X60")]) # suitable
  issueprobs$Epidemics <- rowSums(issueprobs[,c("X16", "X25", "X61", "X103")]) # suitable
  issueprobs$Family <- rowSums(issueprobs[,c("X14", "X69")]) # suitable
  issueprobs$Functional <- rowSums(issueprobs[,c("X5", "X10", "X12", "X32", "X35", "X37", "X45", "X67", "X76", "X81", "X90", "X93", "X119")]) # no issues
  issueprobs$Geopolitical <- rowSums(issueprobs[,c("X1", "X11", "X19", "X28", "X31", "X64", "X65", "X68", "X71", "X78", "X85", "X88", "X95", "X99", "X107", "X108", "X113", "X116")]) # suitable, most of "location" moved to geopolitical
  issueprobs$Government <- rowSums(issueprobs[,c("X6", "X22", "X40", "X43", "X56", "X58", "X102", "X109", "X117")]) #suitable
  issueprobs$Health <- rowSums(issueprobs[,c("X30", "X57")]) # suitable
  issueprobs$Justice <- rowSums(issueprobs[,c("X48", "X92", "X97", "X110")]) #suitable
  issueprobs$Labor <- rowSums(issueprobs[,c("X13", "X62")]) # suitable, could be merged with economy
  issueprobs$Leisure <- rowSums(issueprobs[,c("X7", "X21", "X27", "X59", "X82", "X94")]) # suitable
  issueprobs$Location <- (issueprobs[,c("X112")]) # suitable, less significant
  issueprobs$Military <- rowSums(issueprobs[,c("X18", "X26", "X51", "X73", "X118")]) # suitable
  issueprobs$Public <- rowSums(issueprobs[,c("X47", "X77", "X120")]) # suitable
  issueprobs$ScienceTech <- rowSums(issueprobs[,c("X36", "X8", "X46")]) # suitable, science merged with technology
  issueprobs$Society <- rowSums(issueprobs[,c("X2", "X3", "X55")]) # suitable
  issueprobs$Transport <- rowSums(issueprobs[,c("X33", "X41", "X74", "X83")]) # suitable
  issueprobs$Welfare <- (issueprobs[,c("X105")])

  # Document ids, row-aligned with the STM document order.
  issueprobs$id <- out.selected3$meta$id

  # Topic areas used for example-text extraction.
  # FIX: a first version of this vector (additionally containing "Functional"
  # and "Location") was immediately overwritten by the line below; the dead
  # first assignment has been removed.
  topic_areas <- c("Disaster", "Economy", "Education", "Energy", "Epidemics", "Family", "Geopolitical", "Government", "Health", "Justice", "Labor", "Leisure", "Military", "Public", "ScienceTech", "Society", "Transport", "Welfare")

  # Number of topic areas iterated over below.
  I <- length(topic_areas)

  # Copy the aggregated topic-area shares into the corpus document variables
  # (issueprobs rows are aligned with cpx documents via the model order).
  docvars(cpx)$Disaster <- issueprobs$Disaster
  docvars(cpx)$Economy <- issueprobs$Economy
  docvars(cpx)$Education <- issueprobs$Education
  docvars(cpx)$Energy <- issueprobs$Energy
  docvars(cpx)$Epidemics <- issueprobs$Epidemics
  docvars(cpx)$Family <- issueprobs$Family
  docvars(cpx)$Functional <- issueprobs$Functional
  docvars(cpx)$Geopolitical <- issueprobs$Geopolitical
  docvars(cpx)$Government <- issueprobs$Government
  docvars(cpx)$Health <- issueprobs$Health
  docvars(cpx)$Justice <- issueprobs$Justice
  docvars(cpx)$Labor <- issueprobs$Labor
  docvars(cpx)$Leisure <- issueprobs$Leisure
  docvars(cpx)$Location <- issueprobs$Location
  docvars(cpx)$Military <- issueprobs$Military
  docvars(cpx)$Public <- issueprobs$Public
  docvars(cpx)$ScienceTech <- issueprobs$ScienceTech
  docvars(cpx)$Society <- issueprobs$Society
  docvars(cpx)$Transport <- issueprobs$Transport
  docvars(cpx)$Welfare <- issueprobs$Welfare

  # Era subsets used to pull example texts per topic area.
  texts_1788_1899 <- corpus_subset(cpx,year<1900)
  texts_1900_1979 <- corpus_subset(cpx,year>1899&year<1980)
  texts_1980_2020 <- corpus_subset(cpx,year>1979)

  # Names of the era corpora, resolved with get() in the loop below.
  # FIX: the first entry read "texts_1785_1899", but the variable created
  # above is texts_1788_1899 -- get() would fail on the typo.
  era_list <- c("texts_1788_1899", "texts_1900_1979", "texts_1980_2020")
  J <- length(era_list)

  # For each topic area x era: export the document with the highest topic-area
  # share ("to-<area>-<era>.txt"), plus the headlines of the top documents.
  for (i in 1:I)
    {
      for (j in 1:J)
        {
          corp <- get(era_list[j])
          # Position(s) of the document with the maximum topic-area share.
          pos <- which(docvars(corp)[,topic_areas[i]]==max(docvars(corp)[,topic_areas[i]]))
          # 10th-highest share, used as the headline cut-off below.
          # NOTE(review): the strictly-greater filter below yields the top 9
          # headlines (plus ties) -- confirm "Top 10" is meant inclusively.
          top10 <- docvars(corp)[,topic_areas[i]][order(docvars(corp)[,topic_areas[i]],decreasing=TRUE)][10]

          # FIX: the KWIC line pasted $pre twice; the right-hand context is
          # $post.  kwic() is also computed once instead of three times.
          kw <- kwic(corp[pos],myKeys,window=15)
          text1 <-  paste("ID=:",docvars(corp)$id[pos],"\n",
                    "Share of topic:",round(docvars(corp)[pos,topic_areas[i]],3),"\n",
                    "Topic complex:",topic_areas[i],"\n",
                    "Newspaper:",docvars(corp)$newspaper[pos],"\n",
                    "Publication date:",docvars(corp)$date[pos],"\n",
                    "Headline:",docvars(corp)$headline[pos],"\n",             
                    "Text:",paste(corp[pos]),"\n",  
                    "KWIC:",paste(kw$pre,str_to_upper(kw$keyword),kw$post,"\n",collapse=" "),
                    "Top 10 headlines:",paste(paste(docvars(corp)$headline[docvars(corp)[,topic_areas[i]]>top10],"\n"),collapse=" "))
          filename1 <- paste0("to", "-",i,"-",j,".txt")
          write.csv(text1,file=filename1)
            print(paste0(i,"/",I))
            print(paste0(j,"/",J))
            flush.console()
        }
    }

######################
######################
# GET EXAMPLE TEXTS FROM THE MAJOR TOPICAL CRISES

  # Numeric month from the 3-letter English abbreviation in the date metadata.
  docvars(cpx)$Month <- Recode(str_extract(docvars(cpx)$month,"^.{3,3}"),"'Jan'=1;'Feb'=2;'Mar'=3;'Apr'=4;'May'=5;'Jun'=6;'Jul'=7;'Aug'=8;'Sep'=9;'Oct'=10;'Nov'=11;'Dec'=12")

  # Day of month; documents after 2014 carry the day at the start of the
  # `time` field instead of in `day` (different source format -- confirm).
  docvars(cpx)$Day <- as.numeric(ifelse(docvars(cpx)$year>2014,str_extract(docvars(cpx)$time,"^[:digit:]{1,2}"),docvars(cpx)$day))

  # Rebuild a parseable date string and a POSIXct timestamp per document.
  docvars(cpx)$date <- paste0(docvars(cpx)$year,"-",docvars(cpx)$Month,"-",docvars(cpx)$Day)
  docvars(cpx)$Date <- as.POSIXct(as.Date(docvars(cpx)$date))

  # Harmonise area labels with the aggregated topic-area names used above.
  wavedata2$AREA_i <- Recode(wavedata2$AREA,"'Epidemic'='Epidemics';'Science'='ScienceTech';'Technology'='ScienceTech'")

  # Wave subsets per era, matching the corpus era subsets below.
  wave_1788 <- subset(wavedata2,decade<1900)
  wave_1900 <- subset(wavedata2,decade<1980 & decade>1890)
  wave_1980 <- subset(wavedata2,decade>1970)

  texts_1788_1899 <- corpus_subset(cpx,year<1900)
  texts_1900_1979 <- corpus_subset(cpx,year>1899&year<1980)
  texts_1980_2020 <- corpus_subset(cpx,year>1979)

  topic_areas <- c("Disaster", "Economy", "Education", "Energy", "Epidemics", "Family", "Geopolitical", "Government", "Health", "Justice", "Labor", "Leisure", "Military", "Public", "ScienceTech", "Society", "Transport", "Welfare")

  # Names of the era corpora / wave subsets, resolved with get() in the loop.
  text_list <- c("texts_1788_1899", "texts_1900_1979", "texts_1980_2020")
  wave_list <- c("wave_1788", "wave_1900", "wave_1980")

  I <- length(topic_areas)
  J <- length(wave_list)

  # For each topic area x era: find the largest crisis wave of that area and
  # export the wave's top document ("cr-<area>-<era>.txt").
  for (i in 1:I){
    for (j in 1:J){
    txt <-  get(text_list[j])
    wave <- get(wave_list[j])
    if(any(wave$AREA_i==topic_areas[i],na.rm=TRUE)){
      wav  <- subset(wave,AREA_i==topic_areas[i])
      # Wave with the maximum coverage volume within this area and era.
      wa   <- wav[wav$volume==max(wav$volume),]
      # Row of that wave in the full wave table (for the headline matrix wkm).
      index <- which(wavedata2$volume==max(wav$volume)&wavedata2$id==wav$id[wav$volume==max(wav$volume)])
      # Wave start/end are stored as days since year 0; convert to POSIXct.
      starttime <- as.POSIXct(wa$start*60*60*24,origin="0000-01-01")
      endtime <- as.POSIXct(wa$end*60*60*24,origin="0000-01-01")
      te <- corpus_subset(txt,Date>=starttime & Date<=endtime)
      # Document with the highest topic-area share inside the wave window.
      # (Renamed from `t`, which shadowed base::t.)
      doc <- te[docvars(te)[,topic_areas[i]]==max(docvars(te)[,topic_areas[i]])]
      # FIX: the KWIC line pasted $pre twice; the right-hand context is $post.
      kw <- kwic(doc,myKeys,window=15)
      text1 <-  paste("ID:",docvars(doc)$id,"\n",
                "Duration:",wa$duration,"days\n",
                "Volume:",wa$volume,"articles\n",
                "Topic:",wa$topic,"\n",                   
                "Topic area:",wa$AREA2,"\n",        
                "Topic complex:",topic_areas[i],"\n",
                "Share of topic area:",round(docvars(doc)[,topic_areas[i]],3),"\n",
                "Newspaper:",docvars(doc)$newspaper,"\n",
                "Publication date:",docvars(doc)$date,"\n",
                "Headline:",docvars(doc)$headline,"\n",             
                "Text:",paste(doc),"\n",  
                "KWIC:",paste(kw$pre,str_to_upper(kw$keyword),kw$post,"\n",collapse=" "),
                "Top 10 headlines:",paste(paste(wkm[index,],"\n"),collapse=" "))}
    else text1 <- paste0("No relevant crisis news waves for this topic \n")               
    filename1 <- paste0("cr", "-",i,"-",j,".txt")
    write.csv(text1,file=filename1)
      print(paste0(i,"/",I))
      print(paste0(j,"/",J))
      flush.console()
    
    }
  }





######################
######################
# GET EXAMPLE TEXTS FOR CRISIS NEWS WAVES

  # Aliases kept for compatibility with earlier naming in this file.
  wd5 <- wavedata2
  wavedata.vol50 <- wavedata2

  # Number of crisis waves to export.
  W <- dim(wavedata2)[1]

  # NOTE(review): changing the working directory mid-script makes later
  # relative paths era-dependent; prefer building full paths with file.path().
  setwd(".//crisis//et")

  # For every wave: export the top-ranked associated article with wave
  # metadata, keywords, full text, KWIC, and the wave's top-10 headlines.
  for (w in 1:W)
  {
    # Top two articles of the wave (only the first is exported below).
    text.to.output <- wave.keywords[[w]]$texts[1:2,]
    text1 <-    paste("ID=:",(text.to.output[1,1]),"\n",
        "Share of topic:",round(text.to.output[1,2],3),"\n",
        "Keyword list:",paste(wave.keywords[[w]]$wordlist,collapse=" "),"\n",
        "Duration:",paste(wavedata.vol50[w,]$duration,collapse=" "),"\n",
        "Start date:",paste(as.POSIXct(wavedata.vol50[w,]$startdate,origin=c("1970-01-01"))),"\n",
        "End date:",paste(as.POSIXct(wavedata.vol50[w,]$enddate,origin=c("1970-01-01"))),"\n",
        "Topic:",wavedata.vol50[w,]$TOPIC,"\n",
        "Topic Complex:",wavedata.vol50[w,]$AREA,"\n",
        "Topic Area:",wavedata.vol50[w,]$AREA2,"\n",        
        "Newspaper:",docvars(cpx)$newspaper[which(docvars(cpx)$id==text.to.output[1,1])[1]],"\n",
        "Publication date:",docvars(cpx)$date[which(docvars(cpx)$id==text.to.output[1,1])[1]],"\n",
        "Headline:",docvars(cpx)$headline[which(docvars(cpx)$id==text.to.output[1,1])[1]],"\n\n\n",
        "Text:",ifelse(!is.na(which(docvars(cpx)$id==text.to.output[1,1])[1]),paste(cpx[which(docvars(cpx)$id==text.to.output[1,1])[1]],collapse=" "),"NO TEXT IDENTIFIED"),"\n\n\n",
        "KWIC:",paste(if(!is.na(which(docvars(cpx)$id==text.to.output[1,1])[1])) { paste((kwic(cpx[which(docvars(cpx)$id==text.to.output[1,1])[1]],pattern=myKeys,window=15))$pre,str_to_upper((kwic(cpx[which(docvars(cpx)$id==text.to.output[1,1])[1]],pattern=myKeys,window=15))$keyword),(kwic(cpx[which(docvars(cpx)$id==text.to.output[1,1])[1]],pattern=myKeys,window=15))$post,"\n",collapse=" ")} else paste("NO TEXT IDENTIFIED")),"\nTop 10 headlines:",paste(paste(wkm[w,],"\n"),collapse=" ") )
      
    #       ifelse(!is.na(which(docvars(tok2)$id==text.to.output[1,1])[1]),paste((kwic(tok2[which(docvars(tok2)$id==text.to.output[1,1])[1]],pattern=myKeys,window=15))$pre,str_to_upper((kwic(tok2[which(docvars(tok2)$id==text.to.output[1,1])[1]],pattern=myKeys,window=15))$keyword),(kwic(tok2[which(docvars(tok2)$id==text.to.output[1,1])[1]],pattern=myKeys,window=15))$post),"NO TEXT IDENTIFIED"))
      
    # NOTE(review): unlike the earlier et-export, this filename is not
    # zero-padded, so files will not sort numerically.
    filename1 <- paste0("et",w,"---1", ".txt")
    # text2 <-  paste((text.to.output[2,1]),"\n",
    #       (text.to.output[2,2]),"\n",
    #       paste(wave.keywords[[w]]$wordlist,collapse=" "),"\n",
    #       wave.keywords[[w]]$topic,"\n",
    #       wave.keywords[[w]]$area,"\n",
    #       (textfiles.1$newspaper[which(textfiles.1$id==text.to.output[2,1])]),"\n",
    #       (textfiles.1$date[which(textfiles.1$id==text.to.output[2,1])]),"\n",
    #       (textfiles.1$headline[which(textfiles.1$id==text.to.output[2,1])]),"\n",
    #       (textfiles.1$text[which(textfiles.1$id==text.to.output[2,1])]))
    #filename2 <- paste0("et",ifelse(w>999,"0",ifelse(w>99,"00",ifelse(w>9,"000", "0000"))),w,"-2", ".txt")

    #save.csv(text1,file=filename1)

    write.csv(text1,file=filename1)

    # textdirectory1[w] <- c(text1)
    # textdirectory2[w] <- c(text2)
      print(paste0(w,"/",W))
      flush.console()
  }


  fileConn<-file("EventText1.txt")
  writeLines(textdirectory1, fileConn)
  close(fileConn)

  fileConn<-file("EventText2.txt")
  writeLines(textdirectory2, fileConn)
  close(fileConn)

4.3 Predefined dictionaries

Code
### Dictionaries from quanteda.dictionaries package
# Bind the bundled sentiment/topic dictionaries to short local names.
  dict_RID <- quanteda.dictionaries::data_dictionary_RID
  dict_MFD <- quanteda.dictionaries::data_dictionary_MFD
  dict_LaverGarry <- quanteda.dictionaries::data_dictionary_LaverGarry
  dict_LSD2015 <- quanteda.dictionaries::data_dictionary_LSD2015
  dict_geninqposneg <- quanteda.dictionaries::data_dictionary_geninqposneg
  dict_HuLiu <- quanteda.dictionaries::data_dictionary_HuLiu
  dict_LoughranMcDonald <- quanteda.dictionaries::data_dictionary_LoughranMcDonald
  dict_NRC <- quanteda.dictionaries::data_dictionary_NRC

### UK/US English conversion
  dict_us2uk <- quanteda.dictionaries::data_dictionary_us2uk
  # FIX: the original assigned data_dictionary_us2uk to BOTH names (copy-paste
  # error), so the UK->US direction was wrong. Use the uk2us dictionary here.
  dict_uk2us <- quanteda.dictionaries::data_dictionary_uk2us

### States and Geolocations
  # Country/geolocation dictionary: each key maps to the state's name, major
  # cities, demonyms and historical names as they appear in the corpus.
  # Non-standard spellings ('Dehli', 'Portoguese', 'St. Peterburg', 'Tokio',
  # 'Bukarest', ...) are kept verbatim — NOTE(review): presumably deliberate
  # to match historical/OCR spellings in the digitized texts; confirm before
  # "correcting" any of them. Glob wildcards (*) follow quanteda's default
  # dictionary valuetype.
  # FIX: the original did not parse — commas were missing after the Japan,
  # Finland and Mexico entries, and the closing ")) " of dictionary(list(...))
  # was absent entirely.
  myStates <- dictionary(list(
    UK          = c('England','London','Liverpool','Scotland','Glasgow','Edinburgh',
                    'Newcastle','Birmingham','United Kingdom', 'UK','Britain', 'Dover',
                    'Belfast', 'Brighton','Sheffield', 'Southampton','Wales', 'Lancashire', 
                    'Ulster','Northern Ireland' ,'Lockerbie','York','Devon'),
    France      = c('France','Paris','FRANCE'),
    Russia      = c('Russia','St. Peterburg','Moscow','Soviet','Chernobyl'),
    Ireland     = c('Ireland','Dublin'),
    Austria     = c('Austria','Vienna','VIENNA'),
    Germany     = c('Germany','Berlin','Hamburg','Bonn','Frankfurt'),
    India       = c('India','Dehli','Bombay'),
    Turkey      = c('Turkey','Ankara','Istanbul','Constantinople'),
    Egypt       = c('Egypt','Cairo','Alexandria'),
    Italy       = c('Italy','Rome','Italian'),
    Greece      = c('Greece','Greek','Athens'),
    Spain       = c('SPAIN','Spain','Spanish','Madrid'),
    Bulgaria    = c('Bulgaria','Bulgarian','Sofia'),
    USA         = c('WASHINGTON','U.S.','United States','USA','America','American', 
                    'Washington','New York','Los Angeles','Chicago','Baltimore','Boston',
                    'Philadelphia','San Francisco','California','Phoenix','Texas',
                    'Manhattan','Hollywood'),
    Serbia      = c('Serbia','Serbian','Beograd*','Belgrad*','Yugoslav*'),
    SouthAfrica = c('Rhodesia','South Africa*','Pretoria','Capetown',
                    'Cape Town','Johannesburg'),
    Hungary     = c('Hungary','Hungarian','Budapest'),
    China       = c('China','Chinese','Beijing','Peking','Manchuria','Shanghai'),
    Canada      = c('Canada','Canadian','Ottawa','Montreal','Toronto','Vancouver'),
    Portugal    = c('Portugal','Portoguese','Porto','Lisbon','Lisboa'),
    Japan       = c('Japan','Japanese','Tokio','Kyoto','Tokyo'),
    Australia   = c('Australia','Australian','Melbourne','Sydney','Canberra','Perth'),
    Brazil      = c('Brazil','Brazilian','Brasil','Brasilian','Rio de Janeiro',
                    'Sao Paulo','Brazilia','Brasilia'),
    Belgium     = c('Belgium','Belgian','Brussels','Antwerp'),
    Cuba        = c('Cuba','Cuban','Havana'),
    Malta       = c('Malta','Maltes*'),
    Gibraltar   = c('Gibraltar'),
    CzechRepublic = c('Czechoslovakia','Czechia','Bohemia*','Czech','Moravia'),
    Switzerland = c('Switzerland','Swiss','Helveti*','Geneva','Zurich','Bern'),
    Argentina   = c('Argentin*','Buenos Aires'),
    Netherlands = c('Holland','Netherlands','Dutch','Amsterdam',
                    'Rotterdam','Hague'),
    Morocco     = c('Morocc*','Rabat','Casablanca'),
    Romania     = c('Rumania*','Romania*','Bukarest'),
    Sweden      = c('Sweden','Swedish','Stockholm'),
    Norway      = c('Norway','Norwegian','Oslo'),
    Denmark     = c('Denmark','Danish','Copenhagen'),
    Finland     = c('Finland','Finnish','Helsinki'),
    IsraelPalestine = c('Palestine','Israel*','Palestinian',
                    'Jerusalem','Tel Aviv'),
    Macedonia   = c('Macedonia','Macedon'),
    NewZealand  = c('New Zealand','Auckland','Wellington'),
    Iran        = c('Iran','Persia','Persian','Iranian'),
    Pakistan    = c('Pakistan','Lahore','Islamabad'),
    Vietnam     = c('Vietnam*','Saigon','Hanoi'),
    Lebanon     = c('Lebanon','Lebanese','Beirut'),
    Jordan      = c('Jordan','Jordanian'),
    Algeria     = c('Algeria*','Algier'),
    Syria       = c('Syria*','Damascus'),
    Iraq        = c('Iraq*','Bagdad','Baghdad'),
    Congo       = c('Congo*','Zaire','Kinshasa','Brazzaville',
                    'Leopoldville'),
    Singapore   = c('Singapore','Singapur*'),
    HongKong    = c('Hongkong','Hong Kong'),
    Kenya       = c('Kenya*','Nairobi'),
    Zambia      = c('Zambia','Lusaka'),
    Korea       = c('Korea','Seoul','Pyeongyang'),
    Laos        = c('Laos','Laotian','Vientiane'),
    Arabia      = c('Saudi','Arabia','Kuweit','Kuwait'),
    Cyprus      = c('Cyprus','Cypriot*','Nicosia'),
    Mexico      = c('Mexico','Mexican'),
    Kosovo      = c('Kosovo','Kosov*','Pristina'),
    Indonesia   = c('Indonesia*','Java','Sumatra','Jakarta'),
    Afghanistan = c('Kabul','Afghan*'),
    Bosnia      = c('Bosnia','Sarajevo'),
    Malaysia    = c('Malaysia','Kuala'),
    Taiwan      = c('Taiwan','Taipeh'),
    Thailand    = c('Thailand','Bangkok','Thai*'),
    Ethiopia    = c('Ethiopia*','Addis Abeba'),
    Ukraine     = c('Ukraine','Kiev','Kyiv'),
    Libya       = c('Libya*','Tripoli*','Benghazi'),
    Nigeria     = c('Lagos','Nigeria*'),
    Colombia    = c('Colombia*','Bogota'),
    Philippines = c('Philippin*','Manila','Quezon'),
    Poland      = c('Polish','Poland','Warsaw'),
    Zimbabwe    = c('Zimbabwe*','Harare')
  ))

### Topics
  # Coarse topic screens: glob patterns applied to article tokens, one
  # category per substantive domain of crisis coverage.
  myTopics <- dictionary(list(
    # Economy, business, finance and resources.
    economic = c(
      "*recession*", "*econo*", "*business*", "*corpor*",
      "*enterpri*", "*money*", "*product*", "*sector*", "*branch*",
      "*industry*", "*trade*", "*scarc*", "*profit*", "*asset*",
      "*earn*", "*loss*", "*turnover*", "*revenue*", "*sales*",
      "*demand*", "*supply*", "*credit*", "*inflation*", "*deflation*",
      "*currency*", "*value*", "*employ*", "*labor*", "*machine*",
      "*bank*", "*financ*", "oil", "*gas", "*coal", "*resourc*",
      "*iron*", "*ore", "*steel*"
    ),
    # Disease outbreaks, epidemics and medicine.
    health = c(
      "*epidemic*", "*pandemic*", "*sick*", "*ill*", "*plague*",
      "*choler*", "*outbreak*", "*medic*", "*physicia*", "*surgeo*",
      "*surgery*", "*pharma*", "*nurse*", "*doctor*", "*pox*", "*variol*",
      "*pest*", "*ebola*", "*aids*", "*hiv*", "*fever*", "*cough*", "*typhus*",
      "*vaccin*", "*bubonic*", "*influenza*", "*flu", "*polio*", "*diphteria*",
      "*anthrax*", "*ehec*", "*sars*", "*h1n*", "*h5n*", "zika",
      "mers*", "*legioella*", "*dengue*", "*covid*", "*corona*", "*measle*",
      "*rubella*"
    ),
    # Diplomacy, negotiations and war.
    diplomatic = c(
      "*diploma*", "*negotia*", "*peace*", "*war", "war*",
      "*escala*", "*summit*"
    ),
    # Rebellions, revolutions and civil unrest.
    rebellion = c(
      "*rebel*", "*revolt*", "*revolu*", "*insurr*", "*riot*",
      "*upris*", "*insurg*"
    ),
    # Poverty and social hardship.
    social = c(
      "*pover*", "*pauper*", "*poor*", "*miserab*", "*hardship*",
      "*distress*", "*misery*", "*penury*", "*impecunious*", "*destitut*",
      "*indigence*", "*needi*", "*beggar*", "*homeless*", "*penniless*"
    ),
    # Environment, climate and pollution.
    environmental = c(
      "clean", "*sustainab*", "*preservation*",
      "*conservation*", "*ecolog*", "*greenhouse*", "*climate*",
      "*warming*", "*nature*", "*acid*", "*pollut*", "*contamin*"
    )
  ))

### Organizations

  # Organisations, institutions and companies tracked in coverage.
  # FIX: the original did not parse — commas were missing after the Parties
  # and NATO entries, and IMF.WorldBank lacked its "=" and trailing comma.
  # The two empty-string entries ('') in the Empire category were removed:
  # they are placeholder residue, not valid match patterns.
  myBodies <- dictionary(list(
    Government  = c('White House','Government','Govern','State','Cabinet'),
    Parliament  = c('Congress','Senate','House','Parliament','Legislative'),
    Ministry    = c('Pentagon','MAFF'),
    Military    = c('Army'),
    Parties     = c('Sinn Fein','Labour','Conservativ*','Liberal*'),
    Empire      = c('Empire','Imperial'),
    Coalition   = c('States','Powers','Alliance','Coalition'),
    EU          = c('EU','EEC','European Union','EC','European Community','European Parliament','European Commission','European Council','Council of the European Union'),
    CentralBank = c('ECB','Fed','Federal Reserve','Bank of England','CBI','Monetary Policy Committee'),
    Aircraft    = c('Boeing','Airbus'),
    Airlines    = c('Swissair','British Airways'),
    CarManufacturers = c('Ford'),
    FinancialRegulators = c('FSA','IMF'),
    UN          = c('UN','United Nations','UNO','UNHCR','UNICEF'),
    IMF.WorldBank = c('IMF','World Bank'),
    OECD        = c('OECD'),
    NATO        = c('NATO','Nato'),
    Treasury    = c('Treasury'),
    Equitable   = c('Equitable'),
    Banks       = c('Bank','Morgan Stanley','HSBC','Deutsche Bank','Goldman Sachs','Merrill Lynch'),
    Consulting  = c('Ernst & Young'),
    Polling     = c('MORI'),
    Insurers    = c('Lloyd','TVM'),
    Media       = c('Reuters','Independent','Times','BBC','ITV','Guardian','[tT]he Sun'),
    Energy      = c('Enron','PG&E'),
    Computing   = c('Microsoft'),
    StockExchange = c('FTSE','Nasdaq','Dow Jones'),
    Logistics   = c('P&O'),
    NHS         = c('NHS')
  ))

### Groups
  # Political groups and factions.
  # FIX: removed the two empty-string placeholders ('') from the Boers entry;
  # empty patterns are copy-paste residue, not valid match values.
  # NOTE(review): 'Imperial' also appears in myBodies$Empire — confirm the
  # overlap is intended.
  myGroups <- dictionary(list(
    Boers        = c('Boers','Imperial'),
    Tory         = c('Tory','Tories','Conservative','Conservatives'),
    Liberals     = c('Liberal','Liberals','Whig','Whigs'),
    Labour       = c('Labour Party','Labour'),
    TerrorGroups = c('[Aa]l Qaeda')
  ))

### Cities
  # City-level dictionary; upper-case variants ('LONDON', 'PARIS', ...) match
  # dateline-style capitalization in the historical texts.
  # FIX: the original was missing the final ")" — dictionary(list( was closed
  # only once, so the expression never parsed.
  myCities <- dictionary(list(
    London      = c('London','LONDON'),
    Paris       = c('Paris','PARIS'),
    Vienna      = c('Vienna','VIENNA'),
    NewYork     = c('New York','NEW YORK','Manhattan','World Trade Centre'),
    Berlin      = c('Berlin'),
    Liverpool   = c('Liverpool'),
    Athens      = c('Athens'),
    Bombay      = c('Bombay'),
    Lisbon      = c('Lisbon','Lisboa'),
    Porto       = c('Porto'),
    Sofia       = c('Sofia'),
    Pretoria    = c('Pretoria'),
    Tokio       = c('Tokio','Tokyo'),
    Naples      = c('Naples'),
    Edinburgh   = c('Edinburgh'),
    Glasgow     = c('Glasgow'),
    Newcastle   = c('Newcastle'),
    Birmingham  = c('Birmingham'),
    Washington  = c('Washington'),
    Chicago     = c('Chicago'),
    CapeTown    = c('Cape Town'),
    Dover       = c('Dover'),
    Brighton    = c('Brighton'),
    Belfast     = c('Belfast'),
    Hamburg     = c('Hamburg'),
    Johannesburg = c('Johannesburg'),
    Sheffield   = c('Sheffield'),
    Philadelphia = c('Philadelphia'),
    Southampton = c('Southampton'),
    Chernobyl   = c('Chernobyl'),
    Lockerbie   = c('Lockerbie'),
    LosAngeles  = c('Los Angeles','Hollywood')
  ))
4.4 Latent Semantic Scaling Dictionaries

4.4.1 Newsgathering

Code
# Rebuild plain-text corpora from the two tokens objects so they can be
# combined: collapse each tokens object back to one string per document,
# restore its docvars, and prefix docnames ("nc_"/"cc_") so the two sources
# stay distinguishable after concatenation.
vapply(nc.tok, paste, FUN.VALUE = character(1), collapse = " ") %>%
  corpus() -> nc.cp
docvars(nc.cp) <- docvars(nc.tok)
docnames(nc.cp) <- paste0("nc_", docnames(nc.cp))

vapply(tms_tk_su, paste, FUN.VALUE = character(1), collapse = " ") %>%
  corpus() -> cc.cp
# Drop docvar column 17 — NOTE(review): presumably a variable absent from
# nc.cp so the two docvar frames align; confirm against the earlier chunk
# where tms_tk_su is built.
docvars(cc.cp) <- docvars(tms_tk_su)[-17]
docnames(cc.cp) <- paste0("cc_", docnames(cc.cp))

# Combined corpus (crisis-coverage set + newsgathering comparison set).
t.cp <- c(cc.cp, nc.cp)

# Keep only features occurring in at least 100 documents for the LSS models.
# NOTE(review): t.dtm2 is not created in this chunk — presumably the dfm built
# from t.cp elsewhere in the file; verify before running this chunk alone.
lsx_dtm <- dfm_trim(t.dtm2, min_docfreq = 100, docfreq_type = "count")

# Seed-word dictionary (round 1) for Latent Semantic Scaling of newsgathering
# language, one category per dimension of news production/transmission.
# Misspelled seeds ("publisb*", "newvs", "tinmes", "wvireless*",
# "corresoondent", "junx") are deliberate: they match common OCR errors in
# the digitized newspaper texts — do not "correct" them.
# FIX: removed a stray empty string ("") from transmission_postal; an empty
# pattern is not a valid seed term.
news_dict <- dictionary(list(
  publish = c("publisb*", "publish*", "publication*", "copie*", "copy*", "document"),
  print = c("papers", "journal", "newspapers", "newspaper", "print*", "gazette*", "column*", "page*", "weekly", "weeklie*", "magazin*", "book*", "memoir*"),
  journalist = c("correspondent*", "reporter*", "author*", "writer*", "editor*", "journalist*", "photographer*", "corresoondent", "presenter*"),
  production = c("write", "writing", "written", "wrote", "insert*", "text*", "articl*", "paragraph*", "film*", "story*"),
  content = c("news", "contain*", "extract*", "intelligence*", "information*", "describ*", "descrip*", "summary", "report", "reports", "newvs", "headline*", "coverage*", "pictur*", "photograph", "photographs", "preview", "word*", "video*", "meme*", "post*"),
  topic = c("concerning*", "refer*", "regard*", "sport*", "hashtag*"),
  genre = c("editorial", "obituary", "chronicl*", "feature*"),
  quality = c("interesting", "sensational", "classified", "reveal*", "trending*", "viral*"),
  credibility = c("authent*", "official*", "semi-official*", "signed*", "quot*", "rumour*", "anonymous*", "reassur*", "trustworth*", "distort*", "accuracy", "damning", "paraphras*"),
  statement = c("announc*", "say*", "stating", "statement*", "confirm*", "mention*", "messag*", "accord*", "emanat*", "edict*", "brief*", "talk*", "interview*", "allegation*", "criticism*", "criticise*", "inquiry"),
  paper_material = c("letter*", "receipt*", "handout*", "dossier*"),
  broadcast_material = c("newsreel*"),
  online_material = c("email", "e-mail", "hotmail"),
  discourse = c("reply", "respon*", "comment*", "correspondenc*"),
  transmission = c("transmiss*", "transmit*", "forwarded", "communicat*", "broadcast*", "enclose*", "messeng*", "send*", "bulletin*", "dissem*", "record*"),
  transmission_postal = c("dispatch*", "despatch*"),
  transmission_telegram = c("telegraph*", "telegram*", "relay*", "cable*"),
  transmission_broadcast = c("broadcast*", "newsreel*", "wireless", "radio*", "wvireless*", "televis*", "tv", "fm", "stereo", "channel*", "mhz", "khz", "programme*", "terrestrial*", "satellite*"),
  transmission_telephone = c("phone", "telephone", "call", "sms"),
  transmission_internet = c("broadcast*", "newsreel*", "digital", "internet", "web", "broadband", "websit*", "online", "skype", "multimedia", "fibre-optic*", "server*", "gadget*", "laptop", "app", "platform*", "upload*", "download*", "algorithm*", "curat*"),
  relation = c("reproduc*"),
  censorship = c("censo*"),
  audience = c("listener*", "audience*", "reader*", "public", "viewer*", "user*", "follower*"),
  paid = c("advertis*"),
  time = c("dated", "latest", "sept", "oct", "junx", "lately", "today*", "to-day*", "tonight*", "to-night*", "daily", "daili*"),
  institution = c("journalism*", "media"),
  institution_agency = c("havas", "agency", "agency:-", "bureau*", "reuters", "pergamon", "maxwells", "associated", "agenci*"),
  institution_newspapers = c("presse", "figaro", "blatt", "herald", "nachrichten"),
  institution_broadcast = c("bbc", "b.b.c", "b.b.c.", "itv", "bbc1", "morgenmagazin", "mittagsmagazin", "heute"),
  institution_socialmedia = c("google", "facebook", "instagram", "twitter", "youtube", "snapchat", "whatsapp", "tiktok"),
  self = c("tinmes"),
  regulation = c("ofcom", "licence"),
  music = c("music", "jazz", "classical*", "pop", "tune*", "sing*", "recital*", "orchestra*", "concert*", "rock", "chart*", "song*", "joplin", "philharmonia", "symphonic"),
  network = c("network*", "social*", "interaction*")
))

# Revised (round 2) newsgathering seed dictionary: an expanded news_dict with
# extra seed terms per category, new categories (institution_internet,
# publisher, scandal), and the former paper/broadcast/online "material"
# categories folded into print/transmission_broadcast/transmission_internet.
# Misspelled seeds ("publisb*", "ublish*", "authior*", "corresoondent",
# "newvs", "wvireless*", "phonc", "tinmes") are deliberate OCR-error variants
# for the digitized texts — do not "correct" them. Some seeds appear in more
# than one category (e.g. "newsreel*", "broadcast*", "dossier*" twice in
# print) — NOTE(review): presumably intentional overlap; confirm.
news_dict_r2 <- dictionary(list(
  publish = c("publisb*", "publish*", "publication*", "copie*", "copy*", "document", "ublish*", "authior*"),
  print = c("papers", "journal", "newspapers", "newspaper", "print*", "gazette*", "column*", "page*", "weekly", "weeklie*", "magazin*", "book*", "memoir*", "dossier*", "letter*", "receipt*", "handout*", "dossier*"),
  journalist = c("correspondent*", "reporter*", "author*", "writer*", "editor*", "journalist*", "photographer*", "corresoondent", "presenter*", "columnist*"),
  production = c("write", "writing", "written", "wrote", "insert*", "text*", "articl*", "paragraph*", "film*", "story*", "camera*", "newsbeat*", "newsdesk*"),
  content = c("news", "contain*", "extract*", "intelligence*", "information*", "describ*", "descrip*", "summary", "report", "reports", "newvs", "headline*", "coverage*", "pictur*", "photograph", "photographs", "preview", "word*", "video*", "meme*", "post*", "headline*", "info*"),
  topic = c("concerning*", "refer*", "regard*", "sport*", "hashtag*"),
  genre = c("editorial", "obituar*", "chronicl*", "feature*", "interview*", "commentar*", "antholog*", "kaleidoscop*"),
  quality = c("interesting", "sensational", "classified", "reveal*", "trending*", "viral*", "investigativ*", "exciting*", "astonishing*"),
  credibility = c("authent*", "official*", "semi-official*", "signed*", "quot*", "rumour*", "anonymous*", "reassur*", "trustworth*", "distort*", "accuracy", "damning", "paraphras*", "confirmed", "unconfirmed", "well-informed*", "wellinformed", "contradict*", "doubtless*", "credence*", "authoritative*", "apparent*", "persist*", "alleged", "genuin*"),
  statement = c("announc*", "say*", "stating", "statement*", "confirm*", "mention*", "messag*", "accord*", "emanat*", "edict*", "brief*", "talk*", "interview*", "criticism*", "criticise*", "briefing*", "justif*", "complain*"),
  discourse = c("reply", "respon*", "comment*", "correspondenc*", "answer*", "unanswered"),
  transmission = c("dissemina*", "transmiss*", "transmit*", "forwarded", "communicat*", "broadcast*", "enclose*", "messeng*", "send*", "bulletin*", "dissem*", "record*"),
  transmission_postal = c("dispatch*", "despatch*"),
  transmission_telegram = c("telegraph*", "telegram*", "relay*", "cable*"),
  transmission_broadcast = c("newsreel*", "broadcast*", "newsreel*", "wireless", "radio*", "wvireless*", "televis*", "tv", "fm", "stereo", "channel*", "mhz", "khz", "programme*", "terrestrial*", "satellite*", "microphon*"),
  transmission_telephone = c("nokia", "vodafone", "carphone", "blackberry", "phone", "telephone", "call", "sms", "iphone*", "ipad*", "telex*", "cellular*", "landline*", "phonc", "smartphon*"),
  transmission_internet = c("broadcast*", "newsreel*", "digital", "internet", "web", "broadband", "websit*", "online", "skype", "multimedia", "fibre-optic*", "server*", "gadget*", "laptop", "app", "platform*", "upload*", "download*", "algorithm*", "curat*", "e-mail*", "email", "e-mail", "hotmail"),
  censorship = c("censo*"),
  audience = c("listener*", "audience*", "reader*", "public", "viewer*", "user*", "follower*", "spectator*"),
  paid = c("advertis*", "adverts"),
  time = c("dated", "latest", "sept", "oct", "junx", "lately", "today*", "to-day*", "tonight*", "to-night*", "daily", "daili*"),
  institution = c("journalism*", "media", "press"),
  institution_agency = c("havas", "agency", "agency:-", "bureau*", "reuters", "pergamon", "maxwells", "associated", "agenci*"),
  institution_newspapers = c("presse", "figaro", "blatt", "herald", "nachrichten", "tabloid*", "gazette*", "frontpage*", "front-page*"),
  institution_broadcast = c("bskyb*", "bbc*", "b.b.c*", "b.b.c.*", "itv*", "bbc1*", "morgenmagazin", "mittagsmagazin", "heute", "television"),
  institution_socialmedia = c("facebook", "instagram", "twitter", "snapchat", "whatsapp", "tiktok"),
  institution_internet = c("google", "internet", "www", "world-wide-web", "worldwideweb", "youtube", "ebay"),
  self = c("tinmes", "timess*", "times2*", "thetimes.co.uk"),
  regulation = c("ofcom", "licence", "overseen*"),
  music = c("music", "jazz", "classical*", "pop", "tune*", "sing*", "recital*", "orchestra*", "concert*", "rock", "chart*", "song*", "joplin", "philharmonia", "symphonic"),
  network = c("network*", "social*", "interaction*"),
  publisher = c("hurst", "murdoch"),
  scandal = c("allegation*", "inquiry", "denie*", "deny*", "apologis*", "disclosur*", "whistleblow*", "blame*", "scandal*", "affair*", "wrongdoing*", "scath*", "accus*", "sack*", "revelation*", "culpabilit*", "regret*", "excus*")
))

# Single-category ("news") variant of news_dict_r2: every seed term from all
# round-2 categories concatenated into one list, for scaling overall
# newsgathering content rather than the individual sub-dimensions. The term
# list is kept verbatim from news_dict_r2, including its duplicates and
# deliberate OCR-error spellings.
news_dict_r2g <- dictionary(list(news = c("publisb*", "publish*", "publication*", "copie*", "copy*", "document", "ublish*", "authior*", "papers", "journal", "newspapers", "newspaper", "print*", "gazette*", "column*", "page*", "weekly", "weeklie*", "magazin*", "book*", "memoir*", "dossier*", "letter*", "receipt*", "handout*", "dossier*", "correspondent*", "reporter*", "author*", "writer*", "editor*", "journalist*", "photographer*", "corresoondent", "presenter*", "columnist*", "write", "writing", "written", "wrote", "insert*", "text*", "articl*", "paragraph*", "film*", "story*", "camera*", "newsbeat*", "newsdesk*", "news", "contain*", "extract*", "intelligence*", "information*", "describ*", "descrip*", "summary", "report", "reports", "newvs", "headline*", "coverage*", "pictur*", "photograph", "photographs", "preview", "word*", "video*", "meme*", "post*", "headline*", "info*", "concerning*", "refer*", "regard*", "sport*", "hashtag*", "editorial", "obituar*", "chronicl*", "feature*", "interview*", "commentar*", "antholog*", "kaleidoscop*", "interesting", "sensational", "classified", "reveal*", "trending*", "viral*", "investigativ*", "exciting*", "astonishing*", "authent*", "official*", "semi-official*", "signed*", "quot*", "rumour*", "anonymous*", "reassur*", "trustworth*", "distort*", "accuracy", "damning", "paraphras*", "confirmed", "unconfirmed", "well-informed*", "wellinformed", "contradict*", "doubtless*", "credence*", "authoritative*", "apparent*", "persist*", "alleged", "genuin*", "announc*", "say*", "stating", "statement*", "confirm*", "mention*", "messag*", "accord*", "emanat*", "edict*", "brief*", "talk*", "interview*", "criticism*", "criticise*", "briefing*", "justif*", "complain*", "reply", "respon*", "comment*", "correspondenc*", "answer*", "unanswered", "dissemina*", "transmiss*", "transmit*", "forwarded", "communicat*", "broadcast*", "enclose*", "messeng*", "send*", "bulletin*", "dissem*", "record*", "dispatch*", "despatch*", "telegraph*", "telegram*", 
"relay*", "cable*", "newsreel*", "broadcast*", "newsreel*", "wireless", "radio*", "wvireless*", "televis*", "tv", "fm", "stereo", "channel*", "mhz", "khz", "programme*", "terrestrial*", "satellite*", "microphon*", "nokia", "vodafone", "carphone", "blackberry", "phone", "telephone", "call", "sms", "iphone*", "ipad*", "telex*", "cellular*", "landline*", "phonc", "smartphon*", "broadcast*", "newsreel*", "digital", "internet", "web", "broadband", "websit*", "online", "skype", "multimedia", "fibre-optic*", "server*", "gadget*", "laptop", "app", "platform*", "upload*", "download*", "algorithm*", "curat*", "e-mail*", "email", "e-mail", "hotmail", "censo*", "listener*", "audience*", "reader*", "public", "viewer*", "user*", "follower*", "spectator*", "advertis*", "adverts", "dated", "latest", "sept", "oct", "junx", "lately", "today*", "to-day*", "tonight*", "to-night*", "daily", "daili*", "journalism*", "media", "press", "havas", "agency", "agency:-", "bureau*", "reuters", "pergamon", "maxwells", "associated", "agenci*", "presse", "figaro", "blatt", "herald", "nachrichten", "tabloid*", "gazette*", "frontpage*", "front-page*", "bskyb*", "bbc*", "b.b.c*", "b.b.c.*", "itv*", "bbc1*", "morgenmagazin", "mittagsmagazin", "heute", "television", "facebook", "instagram", "twitter", "snapchat", "whatsapp", "tiktok", "google", "internet", "www", "world-wide-web", "worldwideweb", "youtube", "ebay", "tinmes", "timess*", "times2*", "thetimes.co.uk", "ofcom", "licence", "overseen*", "music", "jazz", "classical*", "pop", "tune*", "sing*", "recital*", "orchestra*", "concert*", "rock", "chart*", "song*", "joplin", "philharmonia", "symphonic", "network*", "social*", "interaction*", "hurst", "murdoch", "allegation*", "inquiry", "denie*", "deny*", "apologis*", "disclosur*", "whistleblow*", "blame*", "scandal*", "affair*", "wrongdoing*", "scath*", "accus*", "sack*", "revelation*", "culpabilit*", "regret*", "excus*")))

# Fit one Latent Semantic Scaling model per seed-word category of news_dict.
# The original spelled out 34 near-identical assignments; this loop produces
# exactly the same top-level objects ("<category>.words", e.g. publish.words,
# print.words, ..., network.words) for every category, so downstream code
# that references those names is unaffected. assign() is used deliberately to
# reproduce the original top-level variable bindings.
for (lss_cat in names(news_dict)) {
  assign(
    paste0(lss_cat, ".words"),
    LSX::textmodel_lss(x = lsx_dtm, seeds = news_dict[[lss_cat]])
  )
}

# Same per-category LSS fits for the revised (round 2) dictionary, creating
# "<category>.words_r2" objects.
# FIX: the original called news_dict_r2[["relation"]], but news_dict_r2 has
# no "relation" category (the seeds would be NULL, making that fit fail), and
# it omitted the round-2-only categories institution_internet, publisher and
# scandal entirely. Looping over the dictionary's actual category names fixes
# both: the broken relation.words_r2 call is dropped and the three missing
# models are now fitted.
for (lss_cat in names(news_dict_r2)) {
  assign(
    paste0(lss_cat, ".words_r2"),
    LSX::textmodel_lss(x = lsx_dtm, seeds = news_dict_r2[[lss_cat]])
  )
}


# Round-1 media/news dictionary (quanteda glob patterns), one key per
# conceptual category of news making, content, and distribution.
# Misspelled variants ("publisb*", "corresoondent", "newvs", "wvireless*",
# "tinmes") appear deliberate — presumably OCR-error variants from the
# digitised Times corpus; TODO confirm.
# Fixed: removed a stray empty pattern "" from transmission_postal (leftover
# trailing element).
news_dict <- dictionary(list(
  publish = c("publisb*", "publish*", "publication*", "copie*", "copy*", "document"),
  print = c("papers", "journal", "newspapers", "newspaper", "print*", "gazette*", "column*", "page*", "weekly", "weeklie*", "magazin*", "book*", "memoir*"),
  journalist = c("correspondent*", "reporter*", "author*", "writer*", "editor*", "journalist*", "photographer*", "corresoondent", "presenter*"),
  production = c("write", "writing", "written", "wrote", "insert*", "text*", "articl*", "paragraph*", "film*", "story*"),
  content = c("news", "contain*", "extract*", "intelligence*", "information*", "describ*", "descrip*", "summary", "report", "reports", "newvs", "headline*", "coverage*", "pictur*", "photograph", "photographs", "preview", "word*", "video*", "meme*", "post*"),
  topic = c("concerning*", "refer*", "regard*", "sport*", "hashtag*"),
  genre = c("editorial", "obituary", "chronicl*", "feature*"),
  quality = c("interesting", "sensational", "classified", "reveal*", "trending*", "viral*"),
  credibility = c("authent*", "official*", "semi-official*", "signed*", "quot*", "rumour*", "anonymous*", "reassur*", "trustworth*", "distort*", "accuracy", "damning", "paraphras*"),
  statement = c("announc*", "say*", "stating", "statement*", "confirm*", "mention*", "messag*", "accord*", "emanat*", "edict*", "brief*", "talk*", "interview*", "allegation*", "criticism*", "criticise*", "inquiry"),
  paper_material = c("letter*", "receipt*", "handout*", "dossier*"),
  broadcast_material = c("newsreel*"),
  online_material = c("email", "e-mail", "hotmail"),
  discourse = c("reply", "respon*", "comment*", "correspondenc*"),
  transmission = c("transmiss*", "transmit*", "forwarded", "communicat*", "broadcast*", "enclose*", "messeng*", "send*", "bulletin*", "dissem*", "record*"),
  transmission_postal = c("dispatch*", "despatch*"),
  transmission_telegram = c("telegraph*", "telegram*", "relay*", "cable*"),
  transmission_broadcast = c("broadcast*", "newsreel*", "wireless", "radio*", "wvireless*", "televis*", "tv", "fm", "stereo", "channel*", "mhz", "khz", "programme*", "terrestrial*", "satellite*"),
  transmission_telephone = c("phone", "telephone", "call", "sms"),
  transmission_internet = c("broadcast*", "newsreel*", "digital", "internet", "web", "broadband", "websit*", "online", "skype", "multimedia", "fibre-optic*", "server*", "gadget*", "laptop", "app", "platform*", "upload*", "download*", "algorithm*", "curat*"),
  relation = c("reproduc*"),
  censorship = c("censo*"),
  audience = c("listener*", "audience*", "reader*", "public", "viewer*", "user*", "follower*"),
  paid = c("advertis*"),
  time = c("dated", "latest", "sept", "oct", "junx", "lately", "today*", "to-day*", "tonight*", "to-night*", "daily", "daili*"),
  institution = c("journalism*", "media"),
  institution_agency = c("havas", "agency", "agency:-", "bureau*", "reuters", "pergamon", "maxwells", "associated", "agenci*"),
  institution_newspapers = c("presse", "figaro", "blatt", "herald", "nachrichten"),
  institution_broadcast = c("bbc", "b.b.c", "b.b.c.", "itv", "bbc1", "morgenmagazin", "mittagsmagazin", "heute"),
  institution_socialmedia = c("google", "facebook", "instagram", "twitter", "youtube", "snapchat", "whatsapp", "tiktok"),
  self = c("tinmes"),
  regulation = c("ofcom", "licence"),
  music = c("music", "jazz", "classical*", "pop", "tune*", "sing*", "recital*", "orchestra*", "concert*", "rock", "chart*", "song*", "joplin", "philharmonia", "symphonic"),
  network = c("network*", "social*", "interaction*")
))

# Collapse the document-feature matrix into counts per dictionary category;
# "NOMATCH" absorbs all tokens not covered by any key.
dfml_news <- dfm_lookup(t.dtm2, dictionary = news_dict, nomatch = "NOMATCH")
dfml_news_r2 <- dfm_lookup(t.dtm2, dictionary = news_dict_r2, nomatch = "NOMATCH")
dfml_news_r2g <- dfm_lookup(t.dtm2, dictionary = news_dict_r2g, nomatch = "NOMATCH")

# Copy the round-2 category counts into the docvars of t.dtm2.
docvars(t.dtm2)[, names(news_dict_r2)] <- dfml_news_r2[, names(news_dict_r2)]
# NOTE(review): on a quanteda dfm, `$` accesses docvars rather than feature
# columns, so this may not pull the "news" feature counts as apparently
# intended — as.numeric(dfml_news_r2[, "news"]) would; verify.
docvars(t.dtm2)[, "news_total"] <- dfml_news_r2$news

# Per-1000-token relative frequency for every dictionary category
# (category count / total tokens in the document * 1000).
# Replaces 38 near-identical assignment lines, and fixes the column name
# "institution_broadcasth_rfreq" (stray "h"), which made the later
# pivot_longer() over paste0(names(news_dict_r2), "_rfreq") fail to find
# "institution_broadcast_rfreq".
rfreq_cats <- c(
  "publish", "print", "journalist", "production", "content", "topic",
  "quality", "credibility", "statement", "genre",
  "paper_material", "broadcast_material", "online_material", "discourse",
  "transmission", "transmission_postal", "transmission_telegram",
  "transmission_broadcast", "transmission_telephone", "transmission_internet",
  "censorship", "audience", "paid",
  "institution", "institution_agency", "institution_newspapers",
  "institution_broadcast", "institution_socialmedia",
  "regulation", "music", "network", "news", "time",
  "scandal", "publisher", "institution_internet", "self", "news_total"
)
for (rf_cat in rfreq_cats) {
  docvars(t.dtm2)[[paste0(rf_cat, "_rfreq")]] <-
    1000 * docvars(t.dtm2)[[rf_cat]] / docvars(t.dtm2)$TOTALWORD
}

# Category frequency relative to the total of all news-dictionary matches
# (+1 in the denominator to avoid division by zero), scaled per 1000.
# Replaces 37 near-identical assignment lines, and fixes the column name
# "institution_broadcasth_rrfreq" (stray "h"), which made the later
# pivot_longer() over paste0(names(news_dict_r2), "_rrfreq") fail to find
# "institution_broadcast_rrfreq".
rrfreq_cats <- c(
  "publish", "print", "journalist", "production", "content", "topic",
  "quality", "credibility", "statement", "genre",
  "paper_material", "broadcast_material", "online_material", "discourse",
  "transmission", "transmission_postal", "transmission_telegram",
  "transmission_broadcast", "transmission_telephone", "transmission_internet",
  "censorship", "audience", "paid",
  "institution", "institution_agency", "institution_newspapers",
  "institution_broadcast", "institution_socialmedia",
  "regulation", "music", "network", "news", "time",
  "scandal", "publisher", "institution_internet", "self"
)
for (rr_cat in rrfreq_cats) {
  docvars(t.dtm2)[[paste0(rr_cat, "_rrfreq")]] <-
    1000 * docvars(t.dtm2)[[rr_cat]] / (docvars(t.dtm2)$news_total + 1)
}

# Column-name vectors for the two frequency variants (per-1000-token and
# relative-to-news-total).
media_rfreq <- paste0(names(news_dict_r2), "_rfreq")
media_rrfreq <- paste0(names(news_dict_r2), "_rrfreq")

# Long format for plotting; the 25th category is excluded, as in the
# original analysis.
lmedia1 <- pivot_longer(docvars(t.dtm2), cols = all_of(media_rfreq[-25]))
lmedia2 <- pivot_longer(docvars(t.dtm2), cols = all_of(media_rrfreq[-25]))

# Token-weighted yearly mean per category.
lymedia1 <- lmedia1 %>%
  group_by(year, name) %>%
  summarise(share = weighted.mean(x = value, w = TOTALWORD, na.rm = TRUE))

lymedia2 <- lmedia2 %>%
  group_by(year, name) %>%
  summarise(share = weighted.mean(x = value, w = TOTALWORD, na.rm = TRUE))

# Smoothed yearly trajectories, one facet per category.
ggplot(lymedia1, aes(x = as.numeric(year), y = share)) +
  geom_smooth(span = .25) +
  geom_point(shape = ".") +
  facet_wrap(. ~ name, scales = "free_y")

ggplot(lymedia2, aes(x = as.numeric(year), y = share)) +
  geom_smooth(span = .25) +
  geom_point(shape = ".") +
  facet_wrap(. ~ name, scales = "free_y")

# Transmission-related categories only, overlaid in one panel.
ggplot(
  subset(lymedia2, str_detect(lymedia2$name, "transmission")),
  aes(x = as.numeric(year), y = share, color = name, fill = name)
) +
  geom_smooth(span = .25) +
  geom_point(shape = ".")

4.4.2 Media Populism and People-References

Code
# Rebuild plain-text corpora from the token objects by joining each
# document's tokens with spaces.

# Non-crisis corpus.
nc.cp <- corpus(vapply(nc.tok, paste, FUN.VALUE = character(1), collapse = " "))
docvars(nc.cp) <- docvars(nc.tok)
docnames(nc.cp) <- paste0("nc_", docnames(nc.cp))

# Crisis corpus; docvar column 17 is dropped, as in the original.
cc.cp <- corpus(vapply(tms_tk_su, paste, FUN.VALUE = character(1), collapse = " "))
docvars(cc.cp) <- docvars(tms_tk_su)[-17]
docnames(cc.cp) <- paste0("cc_", docnames(cc.cp))

# Combined corpus: crisis documents first, then non-crisis.
t.cp <- corpus(c(cc.cp, nc.cp))
# Dictionary lookups on the combined dfm: project dictionaries first, then
# off-the-shelf ones (Laver-Garry, NRC, Moral Foundations, RID).
dfmlo_news <- dfm_lookup(t.dtm2, dictionary = news_xwl)
dfmlo_newsdict <- dfm_lookup(t.dtm2, dictionary = news_dict)

dfmlo_LG <- dfm_lookup(t.dtm2, dictionary = data_dictionary_LaverGarry)
dfmlo_NRC <- dfm_lookup(t.dtm2, dictionary = data_dictionary_NRC)
dfmlo_MFD <- dfm_lookup(t.dtm2, dictionary = data_dictionary_MFD)
dfmlo_RID <- dfm_lookup(t.dtm2, dictionary = data_dictionary_RID)

# Attach the first news_xwl category as "news", then every lookup's category
# counts, to the docvars of t.dtm2 (same assignment order as before).
docvars(t.dtm2)[, "news"] <- as.numeric(dfmlo_news[, 1])
for (lookup_dfm in list(dfmlo_newsdict, dfmlo_LG, dfmlo_NRC, dfmlo_MFD, dfmlo_RID)) {
  docvars(t.dtm2)[, colnames(lookup_dfm)] <- lookup_dfm
}

# Score the corpus with five sentiment dictionaries.
t.sentiment1 <- textstat_polarity(t.dtm2, dictionary = data_dictionary_LSD2015)
t.sentiment2 <- textstat_polarity(t.dtm2, dictionary = data_dictionary_geninqposneg)
t.sentiment3 <- textstat_polarity(t.dtm2, dictionary = data_dictionary_HuLiu)
t.sentiment4 <- textstat_polarity(t.dtm2, dictionary = data_dictionary_LoughranMcDonald)
t.sentiment5 <- textstat_polarity(t.dtm2, dictionary = data_dictionary_NRC)

# Reliability of the five scorings, then a unit-weighted composite (sscore).
sentiment_mat <- cbind(
  t.sentiment1$sentiment, t.sentiment2$sentiment, t.sentiment3$sentiment,
  t.sentiment4$sentiment, t.sentiment5$sentiment
)
psych::alpha(sentiment_mat)
docvars(t.dtm2)$sscore <- psych::scoreItems(keys = rep(1, 5), items = sentiment_mat)$scores

# Mean composite sentiment by subcorpus type and by high crisis-type
# salience (share > 0.1 of the respective topic).
sent_dv <- docvars(t.dtm2)
tapply(sent_dv$sscore, sent_dv$subcorpus05, mean, na.rm = TRUE)
tapply(sent_dv$sscore, sent_dv$DIS > 0.1, mean, na.rm = TRUE)
tapply(sent_dv$sscore, sent_dv$POL > 0.1, mean, na.rm = TRUE)
tapply(sent_dv$sscore, sent_dv$GEO > 0.1, mean, na.rm = TRUE)
tapply(sent_dv$sscore, sent_dv$ECO > 0.1, mean, na.rm = TRUE)
tapply(sent_dv$sscore, sent_dv$EPI > 0.1, mean, na.rm = TRUE)
tapply(sent_dv$sscore, sent_dv$DOM > 0.1, mean, na.rm = TRUE)
tapply(sent_dv$sscore, sent_dv$WEL > 0.1, mean, na.rm = TRUE)

# Decade (numeric) and broad period labels derived from publication year.
docvars(t.dtm2)$Decade <- 10 * floor(as.numeric(docvars(t.dtm2)$year) / 10)
# NOTE(review): `Period` merges 1785-1899 into one bin (four periods) while
# `Period_f` below splits it into 1785-1849 / 1850-1899 (five periods) —
# confirm this mismatch is intended.
docvars(t.dtm2)$Period <- Recode(as.numeric(docvars(t.dtm2)$year), "1785:1899='1785-1899';1900:1949='1900-1949';1950:1999='1950-1999';2000:2020='2000-2020'")

# Factor versions for use in model formulas and plotting.
docvars(t.dtm2)$Decade_f <- factor(10 * floor(as.numeric(docvars(t.dtm2)$year) / 10))
docvars(t.dtm2)$Period_f <- factor(Recode(as.numeric(docvars(t.dtm2)$year), "1785:1849='1785-1849';1850:1899='1850-1899';1900:1949='1900-1949';1950:1999='1950-1999';2000:2020='2000-2020'"))


docvars(t.dtm2)$noncrisis <- as.numeric(t.dtm2$subcorpus05 == "routine")
docvars(t.dtm2)$crisis <- as.numeric(t.dtm2$subcorpus05 == "crisis rhetoric")
docvars(t.dtm2)$wave <- as.numeric(t.dtm2$subcorpus05 == "crisis news wave")
docvars(t.dtm2)$PEOPLEWORDS_hi <- as.numeric(t.dtm2$PEOPLEWORDS_rfreq > 10)
docvars(t.dtm2)$DIS_hi <- as.numeric(t.dtm2$DIS > 0.1)
docvars(t.dtm2)$POL_hi <- as.numeric(t.dtm2$POL > 0.1)
docvars(t.dtm2)$ECO_hi <- as.numeric(t.dtm2$ECO > 0.1)
docvars(t.dtm2)$EPI_hi <- as.numeric(t.dtm2$EPI > 0.1)
docvars(t.dtm2)$DOM_hi <- as.numeric(t.dtm2$DOM > 0.1)
docvars(t.dtm2)$GEO_hi <- as.numeric(t.dtm2$GEO > 0.1)

# One OLS model per media-dictionary category: period interacted with corpus
# type (crisis rhetoric / crisis news wave), plus crisis-type controls.
# Replaces 31 near-identical lm() calls. do.call() embeds the *evaluated*
# formula object in each model's call, so downstream effects::Effect() and
# similar call-re-evaluating helpers see the correct formula for every model
# (a plain lm(fml, ...) inside the loop would store only the symbol `fml`).
aov_rhs_time0 <- "(Period_f) * (crisis + wave) + EPI_hi + DOM_hi + ECO_hi + GEO_hi + POL_hi + DIS_hi"
aov_deps_time0 <- c(
  "news", "publish", "print", "journalist", "production", "content",
  "topic", "quality", "credibility", "statement",
  "paper_material", "broadcast_material", "online_material", "discourse",
  "transmission", "transmission_postal", "transmission_telegram",
  "transmission_broadcast", "transmission_telephone", "transmission_internet",
  "censorship", "audience", "paid",
  "institution", "institution_agency", "institution_newspapers",
  "institution_broadcast", "institution_socialmedia",
  "regulation", "music", "network"
)
for (dep in aov_deps_time0) {
  assign(
    paste0("aov_", dep, "_time0"),
    do.call("lm", list(
      formula = stats::as.formula(paste(dep, "~", aov_rhs_time0)),
      data = quote(docvars(t.dtm2))
    ))
  )
}



# Marginal effects of corpus type over time for the "news" outcome.
# Fixed: aov_news_time0 is fitted on Period_f (see the model specification
# above), but Effect() and aes() referenced Decade_f, which is not a model
# term and makes Effect() fail; both now use Period_f.
df.news.period <- data.frame(Effect(aov_news_time0, focal.predictors = c("wave", "crisis", "Period_f"), xlevels = list(wave = c(0, 1), crisis = c(0, 1))))
# Label corpus type; crisis rhetoric takes precedence over wave.
df.news.period$corpus <- ifelse(df.news.period$crisis == 1, "crisis rhetoric",
  ifelse(df.news.period$wave == 1, "crisis news wave",
    "routine"
  )
)

# Drop the impossible combination (a document cannot be both types).
df.news.period <- subset(df.news.period, !(crisis == 1 & wave == 1))

ggplot(df.news.period, aes(y = fit, ymin = lower, ymax = upper, x = as.numeric(Period_f), color = factor(corpus), fill = factor(corpus), shape = factor(corpus))) +
  geom_point(size = 3) +
  geom_ribbon(alpha = .5) +
  theme_soft()

# Sentiment by period, split by people-word salience.
aov_sscore_time0 <- lm(sscore ~ (Period_f) * PEOPLEWORDS_hi, data = docvars(t.dtm2))

df.people.period <- data.frame(
  Effect(aov_sscore_time0,
    focal.predictors = c("PEOPLEWORDS_hi", "Period_f"),
    xlevels = list(PEOPLEWORDS_hi = c(0, 1))
  )
)

# Point-and-ribbon display of the fitted values per period.
ggplot(df.people.period, aes(
  x = Period_f, y = fit, ymin = lower, ymax = upper,
  color = factor(PEOPLEWORDS_hi), fill = factor(PEOPLEWORDS_hi),
  shape = factor(PEOPLEWORDS_hi), group = factor(PEOPLEWORDS_hi)
)) +
  geom_point(size = 3) +
  geom_ribbon(alpha = .5) +
  theme_soft()

# Three-way specification: period x corpus type x people-word salience.
aov_sscore_time0 <- lm(
  sscore ~ (Period_f) * wave * PEOPLEWORDS_hi + (Period_f) * crisis * PEOPLEWORDS_hi,
  data = docvars(t.dtm2)
)

df.crisis.period <- data.frame(
  Effect(aov_sscore_time0,
    focal.predictors = c("crisis", "wave", "Period_f", "PEOPLEWORDS_hi"),
    xlevels = list(PEOPLEWORDS_hi = c(0, 1), wave = c(0, 1), crisis = c(0, 1))
  )
)

# Label corpus type; crisis rhetoric takes precedence over wave.
df.crisis.period$corpus <- "routine"
df.crisis.period$corpus[df.crisis.period$wave == 1] <- "crisis news wave"
df.crisis.period$corpus[df.crisis.period$crisis == 1] <- "crisis rhetoric"

# Effects by period and corpus type, faceted by people-word salience;
# the impossible crisis-and-wave combination is dropped.
ggplot(
  subset(df.crisis.period, !(crisis == 1 & wave == 1)),
  aes(
    x = Period_f, y = fit, ymin = lower, ymax = upper,
    color = factor(corpus), fill = factor(corpus),
    shape = factor(corpus), group = factor(corpus)
  )
) +
  geom_point(size = 3) +
  geom_ribbon(alpha = .5) +
  theme_soft() +
  facet_grid(. ~ PEOPLEWORDS_hi) +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  scale_color_viridis_d(end = .75) +
  scale_fill_viridis_d(end = .75) +
  ylim(-0.6, 0.6)


ggplot(df.people.period, aes(y = fit, ymin = lower, ymax = upper, x = Period_f, color = factor(PEOPLEWORDS_hi), fill = factor(PEOPLEWORDS_hi), shape = factor(PEOPLEWORDS_hi), group = factor(PEOPLEWORDS_hi))) +
  geom_point(size = 3) +
  geom_ribbon(alpha = .5) +
  theme_soft()



plot(Effect(aov_sscore_time0, focal.predictors = c("PEOPLEWORDS_hi", "Period_f")))


aov_sscore_time0 <- lm(sscore ~ (Period_f), data = docvars(t.dtm2))

plot(Effect(aov_sscore_time0, focal.predictors = c("Period_f")))

aov_sscore_time0 <- lm(sscore ~ (Decade_f), data = docvars(t.dtm2))

plot(Effect(aov_sscore_time0, focal.predictors = c("Decade_f")))

# Nested-effects ("/" operator) specifications for the composite sentiment:
# people-word salience nested within corpus type (wave / crisis), with the
# crisis-type dummies nested one level further down.
# NOTE(review): `0 * factor(Decade)` is an unusual formula term; if the
# intent was decade dummies without an intercept, `0 + factor(Decade)` was
# probably meant — confirm before relying on these fits.
aov_sscore_struc2 <- lm(sscore ~ 0 * factor(Decade) + (wave / PEOPLEWORDS_hi / (DIS_hi + POL_hi + ECO_hi + EPI_hi + DOM_hi)) + (crisis / PEOPLEWORDS_hi / (DIS_hi + POL_hi + ECO_hi + EPI_hi + DOM_hi)) + PEOPLEWORDS_hi / (DIS_hi + POL_hi + ECO_hi + EPI_hi + DOM_hi) + (DIS_hi + POL_hi + ECO_hi + EPI_hi + DOM_hi), data = docvars(t.dtm2))

# Same structure with decade-level nesting of the time dimension.
aov_sscore_time2 <- lm(sscore ~ (factor(Decade) / PEOPLEWORDS_hi / crisis) + (factor(Decade) / PEOPLEWORDS_hi / wave) + (wave / PEOPLEWORDS_hi / ((DIS_hi + POL_hi + ECO_hi + EPI_hi + DOM_hi))) + (crisis / PEOPLEWORDS_hi / (DIS_hi + POL_hi + ECO_hi + EPI_hi + DOM_hi)) + PEOPLEWORDS_hi / (DIS_hi + POL_hi + ECO_hi + EPI_hi + DOM_hi) + (DIS_hi + POL_hi + ECO_hi + EPI_hi + DOM_hi), data = docvars(t.dtm2))

# As above, with Period instead of Decade.
aov_sscore_time3 <- lm(sscore ~ (factor(Period) / PEOPLEWORDS_hi / crisis) + (factor(Period) / PEOPLEWORDS_hi / wave) + (wave / PEOPLEWORDS_hi / ((DIS_hi + POL_hi + ECO_hi + EPI_hi + DOM_hi))) + (crisis / PEOPLEWORDS_hi / (DIS_hi + POL_hi + ECO_hi + EPI_hi + DOM_hi)) + PEOPLEWORDS_hi / (DIS_hi + POL_hi + ECO_hi + EPI_hi + DOM_hi) + (DIS_hi + POL_hi + ECO_hi + EPI_hi + DOM_hi), data = docvars(t.dtm2))

# Reduced versions retaining only the disaster indicator (DIS_hi).
aov_sscore_struc <- lm(sscore ~ 0 * factor(Decade) + (wave / PEOPLEWORDS_hi / DIS_hi) + (crisis / PEOPLEWORDS_hi / DIS_hi) + PEOPLEWORDS_hi / DIS_hi + DIS_hi, data = docvars(t.dtm2))

aov_sscore_time <- lm(sscore ~ (factor(Decade) / PEOPLEWORDS_hi / crisis) + (factor(Decade) / PEOPLEWORDS_hi / wave) + (wave / PEOPLEWORDS_hi / DIS_hi) + (crisis / PEOPLEWORDS_hi / DIS_hi) + PEOPLEWORDS_hi / DIS_hi + DIS_hi, data = docvars(t.dtm2))

# Post-hoc contrasts, interaction plot, effect display, and average marginal
# effects for the sentiment model.
# NOTE(review): `aov_sscore` is not defined in this chunk — it is presumably
# fitted earlier in the document; confirm which specification it refers to.
emmeans(aov_sscore, pairwise ~ wave | crisis | PEOPLEWORDS_hi | DIS_hi)

emmip(aov_sscore, DIS_hi ~ crisis + wave | PEOPLEWORDS_hi, CIs = TRUE)

plot(Effect(aov_sscore, focal.predictors = c("wave", "crisis", "PEOPLEWORDS_hi"), xlevels = list(wave = c(0, 1), crisis = c(0, 1), PEOPLEWORDS_hi = c(0, 1))))

# Average marginal effects (margins package).
marg_sscore <- margins::margins(aov_sscore)
me_sscore <- margins::marginal_effects(aov_sscore)
summary(marg_sscore)


# Attach the valence sentiment score (t.valence is computed elsewhere in
# the document).
docvars(t.dtm2)$sentiment <- t.valence$sentiment


# Trim rare features (present in >= 100 documents) for LSS estimation.
# NOTE(review): lsx_dtm is also used by earlier chunks of this document;
# chunk execution order presumably guarantees it exists before that use —
# verify when re-running chunks out of order.
lsx_dtm <- dfm_trim(t.dtm, min_docfreq = 100, docfreq_type = "count")

# Era-specific sub-matrices (pre-1900, 1900-1949, 1950-1979, 1980-1999,
# 2000 onward) for period-wise LSS fits.
lsx_1900 <- dfm_subset(lsx_dtm, year < 1900)
lsx_1950 <- dfm_subset(lsx_dtm, year < 1950 & year > 1899)
lsx_1980 <- dfm_subset(lsx_dtm, year < 1980 & year > 1949)
lsx_2000 <- dfm_subset(lsx_dtm, year < 2000 & year > 1979)
lsx_2020 <- dfm_subset(lsx_dtm, year > 1999)

# Round-1 people-reference terms (glob patterns for quanteda matching).
people_words_r1 <- c(
  "crowd*", "people", "citizen*", "commoner*", "villager*", "resident*",
  "civilian*", "patient*", "recipient*", "consumer*", "taxpayer*", "masses",
  "protester*", "demonstrator*"
)

# Round 2 = round 1 plus additional collective / actor terms, including
# hostile-actor vocabulary (bandits, rioters, militants, ...).
people_words_r2 <- c(
  people_words_r1,
  "fellow-citizen*", "thousands", "hundreds", "sightseer*", "victim*",
  "worshipper*", "heads", "patriot*", "compatriot*", "bandit*", "raider*",
  "mourner*", "bystander*", "spectator*", "inhabitant*", "cultivator*",
  "peasant*", "refugee*", "looter*", "fugitive*", "marauder*", "ruffian*",
  "revolutionaries", "rioter*", "passers-by", "corpses", "queue*", "homeless",
  "vagabond*", "inmate*", "criminal*", "soldier*", "insurgent*", "sufferer*",
  "barbarians", "marcher*", "mob*", "townspeople", "terrorist*", "onlooker*",
  "gang*", "tribesmen", "separatists", "junta*", "extremists", "thug*",
  "guerilla*", "anarchists", "militia*", "militant*", "activists"
)

# Round-2 "neutral" variant: round 1 plus only the non-hostile additions.
people_words_r2n <- c(
  people_words_r1,
  "fellow-citizen*", "thousands", "hundreds", "sightseer*", "victim*",
  "worshipper*", "heads", "patriot*", "compatriot*", "mourner*", "bystander*",
  "spectator*", "inhabitant*", "cultivator*", "peasant*", "refugee*",
  "fugitive*", "passers-by", "queue*", "homeless", "vagabond*", "sufferer*",
  "marcher*", "mob*", "townspeople", "onlooker*", "activists"
)

# One-level people dictionary: all people-reference patterns pooled into a
# single PEOPLEWORDS key. Thematic grouping is shown via comments only;
# the pattern set and order are unchanged.
people_2LLL_list <- dictionary(list(
  PEOPLEWORDS = c(
    # crowds and masses
    "crowd*", "hundreds*", "thousands", "masses", "heads", "queue*", "mob*",
    # victims
    "victim*", "sufferer*", "mourner*", "corpses",
    # citizens
    "citizen*", "commoner*", "civilian*", "taxpayer*", "fellow-citizen*",
    "patriot*", "compatriot*",
    # residents
    "villager*", "resident*", "inhabitant*", "townspeople", "tribesmen",
    # generic people
    "people",
    # rural cultivators
    "cultivator*", "peasant*",
    # criminals
    "bandit*", "raider*", "looter*", "marauder*", "ruffian*", "inmate*",
    "criminal*", "gang*", "thug*",
    # protesters
    "protester*", "demonstrator*", "marcher*", "activists",
    # believers
    "worshipper*",
    # refugees and the poor
    "refugee*", "fugitive*", "homeless", "vagabond*",
    # insurgents
    "revolutionaries", "rioter*", "insurgent*", "terrorist*", "separatists",
    "junta*", "extremists", "guerilla*", "anarchists", "militia*", "militant*",
    # passive roles
    "patient*", "recipient*", "consumer*", "sightseer*", "bystander*",
    "spectator*", "passers-by", "onlooker*",
    # electorate and individuals
    "voter*", "constituenc*", "person*", "individual*",
    # workers and occupations
    "slave*", "worker*", "labourer*", "workman*", "clerk*", "attendant*",
    "employee*", "operator*", "apprentice*", "hireling*",
    "salesperson*", "salesman", "salesmen", "saleswoman", "saleswomen",
    "servant*"
  )
))


# Two-level people dictionary: thematic categories of people references.
# Fixed: removed a trailing comma after "afflicted" in VICTIMS — it left c()
# with an empty argument, which errors at evaluation time
# ("argument ... is empty").
people_2LL_list <- dictionary(list(
  CROWD = c("crowd*", "hundreds*", "thousands", "masses", "heads", "queue*", "mob*"),
  VICTIMS = c("victim*", "sufferer*", "mourner*", "corpses", "affected", "afflicted"),
  CITIZEN = c(
    "citizen*", "commoner*", "civilian*", "taxpayer*", "fellow-citizen*",
    "patriot*", "compatriot*", "voter*", "constituenc*", "tribesmen"
  ),
  # NOTE(review): the `villager =` element name is preserved from the
  # original; confirm quanteda ignores value names in flat categories.
  RESIDENT = c(villager = "villager*", "resident*", "inhabitant*", "townspeople"),
  PEOPLE = c("people", "person*", "individual*"),
  ECONOMIC = c(
    "cultivator*", "peasant*", "slave*", "worker*", "labourer*", "workman*",
    "clerk*", "attendant*", "employee*", "operator*", "apprentice*",
    "hireling*", "salesperson*", "salesman", "salesmen", "saleswoman",
    "saleswomen", "servant*"
  ),
  CRIMINALS = c(
    "bandit*", "raider*", "looter*", "marauder*", "ruffian*", "inmate*",
    "criminal*", "gang*", "thug*"
  ),
  PROTESTERS = c("protester*", "demonstrator*", "marcher*", "activists"),
  BELIEVERS = c("worshipper*"),
  REFUGEE = c("refugee*", "fugitive*"),
  POOR = c("homeless", "vagabond*"),
  INSURGENTS = c(
    "revolutionaries", "rioter*", "insurgent*", "terrorist*", "separatists",
    "junta*", "extremists", "guerilla*", "anarchists", "militia*", "militant*"
  ),
  PASSIVE = c(
    "patient*", "recipient*", "consumer*", "sightseer*", "bystander*",
    "spectator*", "passers-by", "onlooker*"
  )
))

# Fully nested "people" dictionary: the same 13 categories as people_2LL_list,
# but each pattern carries a named sub-key so lookups can report per-word
# counts as well as per-category counts.
# Fix: removed the trailing commas after "townspeople" (RESIDENT) and
# "person*" (PEOPLE) -- trailing commas inside list() are an error in R
# ("argument ... is empty"), so this definition previously failed to evaluate.
people_2L_list <- dictionary(list(
  CROWD = list(
    crowd = "crowd*",
    hundreds = "hundreds*",
    thousands = "thousands",
    masses = "masses",
    heads = "heads",
    queue = "queue*",
    mob = "mob*"
  ),
  VICTIMS = list(
    victims = "victim*",
    sufferer = "sufferer*",
    mourner = "mourner*",
    corpses = "corpses"
  ),
  CITIZEN = list(
    citizen = "citizen*",
    commoner = "commoner*",
    civilian = "civilian*",
    taxpayer = "taxpayer*",
    fellow_citizen = "fellow-citizen*",
    patriot = "patriot*",
    compatriot = "compatriot*",
    voter = "voter*",
    constituency = "constituenc*",
    tribesmen = "tribesmen"
  ),
  RESIDENT = list(
    villager = "villager*",
    resident = "resident*",
    inhabitant = "inhabitant*",
    townspeople = "townspeople"
  ),
  PEOPLE = list(
    people = "people",
    individual = "individual*",
    person = "person*"
  ),
  ECONOMIC = list(
    cultivator = "cultivator*",
    peasant = "peasant*",
    slave = "slave*",
    worker = "worker*",
    labourer = "labourer*",
    workman = "workman*",
    clerk = "clerk*",
    attendant = "attendant*",
    employee = "employee*",
    operator = "operator*",
    apprentice = "apprentice*",
    hireling = "hireling*",
    salesperson = c("salesperson*", "salesman", "salesmen", "saleswoman", "saleswomen"),
    servant = "servant*"
  ),
  CRIMINALS = list(
    bandit = "bandit*",
    raider = "raider*",
    looter = "looter*",
    marauder = "marauder*",
    ruffian = "ruffian*",
    inmate = "inmate*",
    criminal = "criminal*",
    gang = "gang*",
    thug = "thug*"
  ),
  PROTESTERS = list(
    protester = "protester*",
    demonstrator = "demonstrator*",
    marcher = "marcher*",
    activists = "activists"
  ),
  BELIEVERS = list(
    worshipper = "worshipper*"
  ),
  REFUGEE = list(
    refugee = "refugee*",
    fugitive = "fugitive*"
  ),
  POOR = list(
    homeless = "homeless",
    vagabond = "vagabond*"
  ),
  INSURGENTS = list(
    revolutionaries = "revolutionaries",
    rioter = "rioter*",
    insurgent = "insurgent*",
    terrorist = "terrorist*",
    separatists = "separatists",
    junta = "junta*",
    extremists = "extremists",
    guerilla = "guerilla*",
    anarchists = "anarchists",
    militia = "militia*",
    militant = "militant*"
  ),
  PASSIVE = list(
    patient = "patient*",
    recipient = "recipient*",
    consumer = "consumer*",
    sightseer = "sightseer*",
    bystander = "bystander*",
    spectator = "spectator*",
    passers_by = "passers-by",
    onlooker = "onlooker*"
  )
))




# Flat (one-level) "people" dictionary: one key per word family, used for
# tokens_lookup() and as the kwic() pattern set below.  Superset of the
# categorised dictionaries (adds e.g. soldier, barbarians).
# Fix: "hundreds" now uses the glob "hundreds*", matching the pattern used by
# both people_2LL_list and people_2L_list for the same key.
people_list <- dictionary(list(
  crowd = "crowd*",
  people = "people",
  citizen = "citizen*",
  commoner = "commoner*",
  villager = "villager*",
  resident = "resident*",
  civilian = "civilian*",
  patient = "patient*",
  recipient = "recipient*",
  consumer = "consumer*",
  taxpayer = "taxpayer*",
  masses = "masses",
  protester = "protester*",
  demonstrator = "demonstrator*",
  fellow_citizen = "fellow-citizen*",
  thousands = "thousands",
  hundreds = "hundreds*",  # was "hundreds"; aligned with the 2L/2LL dictionaries
  sightseer = "sightseer*",
  victim = "victim*",
  worshipper = "worshipper*",
  heads = "heads",
  patriot = "patriot*",
  compatriot = "compatriot*",
  bandit = "bandit*",
  raider = "raider*",
  mourner = "mourner*",
  bystander = "bystander*",
  spectator = "spectator*",
  inhabitant = "inhabitant*",
  cultivator = "cultivator*",
  peasant = "peasant*",
  refugee = "refugee*",
  looter = "looter*",
  fugitive = "fugitive*",
  marauder = "marauder*",
  ruffian = "ruffian*",
  revolutionaries = "revolutionaries",
  rioter = "rioter*",
  passers_by = "passers-by",
  corpses = "corpses",
  queue = "queue*",
  homeless = "homeless",
  vagabond = "vagabond*",
  inmate = "inmate*",
  criminal = "criminal*",
  soldier = "soldier*",
  insurgent = "insurgent*",
  sufferer = "sufferer*",
  barbarians = "barbarians",
  marcher = "marcher*",
  mob = "mob*",
  townspeople = "townspeople",
  terrorist = "terrorist*",
  onlooker = "onlooker*",
  gang = "gang*",
  tribesmen = "tribesmen",
  separatists = "separatists",
  junta = "junta*",
  extremists = "extremists",
  thug = "thug*",
  guerilla = "guerilla*",
  anarchists = "anarchists",
  militia = "militia*",
  militant = "militant*",
  activists = "activists",
  individual = "individual*",
  person = "person*",
  voter = "voter*",
  constituency = "constituenc*",
  slave = "slave*",
  worker = "worker*",
  labourer = "labourer*",
  workman = "workman*",
  clerk = "clerk*",
  attendant = "attendant*",
  employee = "employee*",
  operator = "operator*",
  apprentice = "apprentice*",
  hireling = "hireling*",
  salesperson = c("salesperson*", "salesman", "salesmen", "saleswoman", "saleswomen"),
  servant = "servant*"
))

# Categorised dictionary of news/media vocabulary (production, transmission,
# institutions, audiences, ...), glob patterns.
# NOTE(review): several entries look like deliberate OCR-error variants for
# the historical scans -- "publisb*", "corresoondent", "newvs", "wvireless*",
# "junx" -- confirm these are intentional before "correcting" them.
news_dict <- dictionary(list(
  publish = c("publisb*", "publish*", "publication*", "copie*", "copy*", "document"),
  print = c("papers", "journal", "newspapers", "newspaper", "print*", "gazette*", "column*", "page*", "weekly", "weeklie*", "magazin*", "book*", "memoir*"),
  journalist = c("correspondent*", "reporter*", "author*", "writer*", "editor*", "journalist*", "photographer*", "corresoondent", "presenter*"),
  production = c("write", "writing", "written", "wrote", "insert*", "text*", "articl*", "paragraph*", "film*", "story*"),
  content = c("news", "contain*", "extract*", "intelligence*", "information*", "describ*", "descrip*", "summary", "report", "reports", "newvs", "headline*", "coverage*", "pictur*", "photograph", "photographs", "preview", "word*", "video*", "meme*", "post*"),
  topic = c("concerning*", "refer*", "regard*", "sport*", "hashtag*"),
  genre = c("editorial", "obituary", "chronicl*", "feature*"),
  quality = c("interesting", "sensational", "classified", "reveal*", "trending*", "viral*"),
  credibility = c("authent*", "official*", "semi-official*", "signed*", "quot*", "rumour*", "anonymous*", "reassur*", "trustworth*", "distort*", "accuracy", "damning", "paraphras*"),
  statement = c("announc*", "say*", "stating", "statement*", "confirm*", "mention*", "messag*", "accord*", "emanat*", "edict*", "brief*", "talk*", "interview*", "allegation*", "criticism*", "criticise*", "inquiry"),
  paper_material = c("letter*", "receipt*", "handout*", "dossier*"),
  broadcast_material = c("newsreel*", "camera*", "footage"),
  online_material = c("email", "e-mail", "hotmail"),
  discourse = c("reply", "respon*", "comment*", "correspondenc*"),
  # Generic transmission terms, then one key per transmission technology era.
  transmission = c("transmiss*", "transmit*", "forwarded", "communicat*", "broadcast*", "enclose*", "messeng*", "send*", "bulletin*", "dissem*", "record*"),
  transmission_postal = c("dispatch*", "despatch*", "mail", "letter*"),
  transmission_telegram = c("telegraph*", "telegram*", "relay*", "cable*"),
  transmission_broadcast = c("broadcast*", "newsreel*", "wireless", "radio*", "wvireless*", "televis*", "tv", "fm", "stereo", "channel*", "mhz", "khz", "programme*", "terrestrial*", "satellite*"),
  transmission_telephone = c("phone", "telephone", "call", "sms"),
  transmission_internet = c("broadcast*", "newsreel*", "digital", "internet", "web", "broadband", "websit*", "online", "skype", "multimedia", "fibre-optic*", "server*", "gadget*", "laptop", "app", "platform*", "upload*", "download*", "algorithm*", "curat*"),
  censorship = c("censo*"),
  audience = c("listener*", "audience*", "reader*", "public", "viewer*", "user*", "follower*"),
  paid = c("advertis*", "subscri*", "news-stand*", "newsstand*"),
  time = c("dated", "latest", "sept", "oct", "junx", "lately", "today*", "to-day*", "tonight*", "to-night*", "daily", "daili*"),
  # Named media institutions, split by type (agencies, papers, broadcasters,
  # social platforms).
  institution = c("journalism*", "media"),
  institution_agency = c("havas", "agency", "agency:-", "bureau*", "reuters", "pergamon", "maxwells", "associated", "agenci*"),
  institution_newspapers = c("presse", "figaro", "blatt", "herald", "nachrichten"),
  institution_broadcast = c("bbc", "b.b.c", "b.b.c.", "itv", "bbc1", "morgenmagazin", "mittagsmagazin", "heute"),
  institution_socialmedia = c("google", "facebook", "instagram", "twitter", "youtube", "snapchat", "whatsapp", "tiktok"),
  regulation = c("ofcom", "licence", "license"),
  music = c("music", "jazz", "classical*", "pop", "tune*", "sing*", "recital*", "orchestra*", "concert*", "rock", "chart*", "song*", "joplin", "philharmonia", "symphonic"),
  network = c("network*", "social*", "interaction*")
))

news_xwl <- dictionary(list(news = c("news*", "paper*", "press*", "media*", "telev*", "radio*", "broadcast*", "bbc*", "itv*", "publication*", "publish", "correspon*", "writ*", "copy*", "copi*", "journal*", "letter*", "article*", "story*", "quot*", "gazett*", "document*", "havas*", "blatt*", "official*", "*official", "text*", "figaro", "tv", "newsroom*", "report*", "edition*", "dispatch*", "despatch*", "rumour", "announce*", "agency", "bulletin*", "semi-official*", "comment*", "communicat*", "telegram*", "telegraph*", "censor*", "transmi*", "messeng*", "bureau*", "text*", "photo*", "cover*", "magazin*", "inform*", "leak*", "reuters", "headline", "story", "stori*", "film*", "network*", "weeklies", "weekly", "interview*", "briefing*", "release")))

# Replace tokens with their people_list dictionary keys in the crisis
# (tms_tk_su) and non-crisis (nc.tok) token sets; unmatched tokens become the
# single category "NOMATCH" so totals remain comparable.
tlo_common_cc <- tokens_lookup(tms_tk_su, dictionary = people_list, nomatch = "NOMATCH")
tlo_common_nc <- tokens_lookup(nc.tok, dictionary = people_list, nomatch = "NOMATCH")

# Build 20-year period subsets of the crisis (tms_tk_su) and non-crisis
# (nc.tok) token sets, plus a 100-document random sample of each, replacing
# 48 near-identical copy-pasted statements.  The objects created carry the
# same names and contents as before: kwic_cc_<start>, kwic_nc_<start>,
# kwics_cc_<start>, kwics_nc_<start> for start in 1780, 1800, ..., 2000.
# NOTE(review): the order of the tokens_sample() calls differs from the
# original unrolled code, so a set.seed() run will not reproduce the original
# draws exactly.
for (start in seq(1780, 2000, by = 20)) {
  lo <- start - 1                                  # strict '>' bound, as before
  hi <- if (start == 2000) 2021 else start + 20    # final window runs to 2020
  cc_sub <- tokens_subset(tms_tk_su, year > lo & year < hi)
  nc_sub <- tokens_subset(nc.tok, year > lo & year < hi)
  assign(paste0("kwic_cc_", start), cc_sub)
  assign(paste0("kwic_nc_", start), nc_sub)
  assign(paste0("kwics_cc_", start), tokens_sample(cc_sub, size = 100))
  assign(paste0("kwics_nc_", start), tokens_sample(nc_sub, size = 100))
}

# Keyword-in-context table for the 1780s crisis sample (100 tokens of context
# on each side of every people_list match), aliased to `x` for ad-hoc console
# inspection.
w50_cc_1780 <- kwic(x = kwics_cc_1780, pattern = people_list, window = 100)

x <- w50_cc_1780

# Names of the per-period 100-document sample objects, in chronological
# order, generated from the period start years rather than spelled out.
kwics_cc <- paste0("kwics_cc_", seq(1780, 2000, by = 20))

kwics_nc <- paste0("kwics_nc_", seq(1780, 2000, by = 20))


# For each 20-year period, print ~100 keyword-in-context extracts around
# people_list matches from the crisis sample and then the non-crisis sample,
# with date and headline, for manual close reading.
# Fix: the non-crisis printout previously pulled its date's companion
# headline from the CRISIS corpus (docvars(tms_tk_su) indexed by x$docname) --
# a copy-paste error; it now uses nc.tok indexed by y$docname.
for (i in seq_along(kwics_cc)) {
  xx <- (kwic(x = get(kwics_cc[i]), pattern = people_list, window = 100))
  yy <- (kwic(x = get(kwics_nc[i]), pattern = people_list, window = 100))
  # Draw up to 100 rows at random (with possible repeats).
  # NOTE(review): round(runif(...)) under-samples the endpoints relative to
  # sample.int(); kept as-is to preserve the original sampling behaviour.
  x <- xx[round(runif(n = 100, min = 1, max = min(c(100, dim(xx)[1]), na.rm = TRUE)), 0), ]
  y <- yy[round(runif(n = 100, min = 1, max = min(c(100, dim(yy)[1]), na.rm = TRUE)), 0), ]

  print(kwics_cc[i])
  cat(paste("\n\n\n", seq_len(nrow(x)), str_to_upper(x$keyword), "**********", docvars(tms_tk_su)[match(x$docname, docnames(tms_tk_su)), "date"], "*****", docvars(tms_tk_su)[match(x$docname, docnames(tms_tk_su)), "headline"], "**********\n [...]", str_wrap(x$pre, 100), "\n**********", x$keyword, "**********\n", str_wrap(x$post, 100), "[...]\n"), 100)
  print(kwics_nc[i])
  cat(paste("\n\n\n", seq_len(nrow(y)), str_to_upper(y$keyword), "**********", docvars(nc.tok)[match(y$docname, docnames(nc.tok)), "date"], "*****", docvars(nc.tok)[match(y$docname, docnames(nc.tok)), "headline"], "**********\n [...]", str_wrap(y$pre, 100), "\n**********", y$keyword, "**********\n", str_wrap(y$post, 100), "[...]\n"), 100)
  flush.console()
}



# Print one full article text wrapped to 100 characters as a spot check, then
# dump the 1780s crisis KWIC table.
# NOTE(review): cat() on a kwic object (a data.frame-like structure) may error
# or print raw fields; print(w50_cc_1780) is the usual display -- confirm.
writeLines(strwrap(paste(wave.114175.texts[[1]], collapse = " "), 100))
cat(w50_cc_1780)



# Collapse the dfm's features into dictionary categories at the three
# granularities (2L = nested per-word, 2LL = 13 categories, 2LLL defined
# elsewhere -- presumably the top-level people/NOMATCH split; confirm), then
# split each by the Corpus docvar into crisis / non-crisis halves.
look_dfm_2L <- dfm_lookup(x = lsx_dtm, dictionary = people_2L_list, nomatch = "NOMATCH")

look_dfm_2LL <- dfm_lookup(x = lsx_dtm, dictionary = people_2LL_list, nomatch = "NOMATCH")

look_dfm_2LLL <- dfm_lookup(x = lsx_dtm, dictionary = people_2LLL_list, nomatch = "NOMATCH")

look_cdfm_2LLL <- dfm_subset(look_dfm_2LLL, Corpus == "Crisis")
look_ndfm_2LLL <- dfm_subset(look_dfm_2LLL, Corpus == "Noncrisis")

look_cdfm_2LL <- dfm_subset(look_dfm_2LL, Corpus == "Crisis")
look_ndfm_2LL <- dfm_subset(look_dfm_2LL, Corpus == "Noncrisis")

# Map dictionary-count dfm rows back onto the master docvars table of t.dtm2.
# Per the original hard-coded indices, rows 1..187573 of t.dtm2 are crisis
# documents and rows 187574..306982 are non-crisis.
n_cc <- 187573
n_total <- 306982

total_to_cc <- match(docvars(t.dtm2)$id, docvars(look_cdfm_2LLL)$id)
cc_to_total <- match(docvars(look_cdfm_2LLL)$id, docvars(t.dtm2)$id)
total_to_nc <- match(docvars(t.dtm2)$id, docvars(look_ndfm_2LLL)$id)
# Fix: this slice previously re-ran the identical match() call; it now reuses
# total_to_nc.  NOTE(review): the slice starts at n_cc (187573), not n_cc + 1,
# while the fills below target rows (n_cc + 1):n_total; it only lines up
# because the extra leading element is NA and is dropped -- confirm intended.
total_to_nc2 <- total_to_nc[n_cc:n_total]
nc_to_total <- match(docvars(look_ndfm_2LLL)$id, docvars(t.dtm2)$id)

# Non-NA row indices into the crisis / non-crisis lookup dfms, precomputed
# once instead of being re-filtered in every assignment.
cc_rows <- cc_to_total[!is.na(cc_to_total)]
nc_rows <- total_to_nc2[!is.na(total_to_nc2)]

# Top-level totals (columns 1 and 2 of the 2LLL lookup): people words and
# everything else.  Replaces two copy-pasted assignment blocks.
count_fields <- c("PEOPLEWORDS", "NOMATCH")
for (j in seq_along(count_fields)) {
  f <- count_fields[j]
  docvars(t.dtm2)[[f]] <- NA
  docvars(t.dtm2)[1:n_cc, f] <- look_cdfm_2LLL[cc_rows, j]
  docvars(t.dtm2)[(n_cc + 1):n_total, f] <- look_ndfm_2LLL[nc_rows, j]
}

# Per-category counts: column j of the 2LL lookup corresponds to the j-th
# dictionary key, in the same order as people_2LL_list.  Replaces thirteen
# copy-pasted assignment blocks.
people_fields <- c(
  "CROWD", "VICTIMS", "CITIZEN", "RESIDENT", "PEOPLE", "ECONOMIC",
  "CRIMINALS", "PROTESTERS", "BELIEVERS", "REFUGEE", "POOR", "INSURGENTS",
  "PASSIVE"
)
for (j in seq_along(people_fields)) {
  f <- people_fields[j]
  docvars(t.dtm2)[[f]] <- NA
  docvars(t.dtm2)[1:n_cc, f] <- look_cdfm_2LL[cc_rows, j]
  docvars(t.dtm2)[(n_cc + 1):n_total, f] <- look_ndfm_2LL[nc_rows, j]
}

# Document length (people words + everything else), then each count rescaled
# to a rate per 1,000 tokens.  Replaces 14 copy-pasted rate computations.
docvars(t.dtm2)$TOTALWORD <- docvars(t.dtm2)$PEOPLEWORDS + docvars(t.dtm2)$NOMATCH

rfreq_src <- c(
  "PEOPLEWORDS", "CROWD", "VICTIMS", "CITIZEN", "RESIDENT", "PEOPLE",
  "ECONOMIC", "CRIMINALS", "PROTESTERS", "BELIEVERS", "REFUGEE", "POOR",
  "INSURGENTS", "PASSIVE"
)
for (v in rfreq_src) {
  docvars(t.dtm2)[[paste0(v, "_rfreq")]] <-
    1000 * docvars(t.dtm2)[[v]] / docvars(t.dtm2)$TOTALWORD
}

# Analysis mask: rates are finite (TOTALWORD > 0) and the GEO topic score is
# present.
docvars(t.dtm2)$finite <- is.finite(docvars(t.dtm2)$PEOPLEWORDS_rfreq) & is.finite(docvars(t.dtm2)$GEO)

# Correlate each people-category rate with article sentiment over documents
# with finite rates.  Replaces 14 copy-pasted cor.test() calls; output is the
# same apart from the variable names shown in each test's "data:" line.
sent_cor_vars <- paste0(
  c("PEOPLEWORDS", "CROWD", "VICTIMS", "CITIZEN", "RESIDENT", "PEOPLE",
    "ECONOMIC", "CRIMINALS", "PROTESTERS", "BELIEVERS", "REFUGEE", "POOR",
    "INSURGENTS", "PASSIVE"),
  "_rfreq"
)
dv_sent <- docvars(t.dtm2)[docvars(t.dtm2)$finite == TRUE, ]
for (v in sent_cor_vars) {
  print(cor.test(dv_sent[[v]], dv_sent$sentiment))
}


# Smoothed sentiment-over-time curves, faceted by corpus and split by whether
# each document exceeds a threshold rate for the given category.  Replaces
# seven copy-pasted ggplot calls; labs() keeps the legend title informative
# (the original showed the literal expression, e.g. "(PEOPLEWORDS_rfreq > 10)").
sent_split_vars <- c(
  PEOPLEWORDS_rfreq = 10,  # higher cut-off: PEOPLEWORDS aggregates all categories
  PROTESTERS_rfreq = 1,
  VICTIMS_rfreq = 1,
  CROWD_rfreq = 1,
  CITIZEN_rfreq = 1,
  POOR_rfreq = 1,
  PEOPLE_rfreq = 1
)
dv_plot <- docvars(t.dtm2)[docvars(t.dtm2)$finite == TRUE, ]
for (v in names(sent_split_vars)) {
  thr <- sent_split_vars[[v]]
  print(
    ggplot(dv_plot, aes(y = sentiment, x = as.numeric(year), color = .data[[v]] > thr)) +
      geom_smooth() +
      facet_grid(. ~ Corpus) +
      labs(color = paste0(v, " > ", thr)) +
      theme_soft()
  )
}



# Targeted correlations between selected people-category rates and crisis-type
# topic scores (GEO, POL, DIS, MIL).  Replaces seven copy-pasted cor.test()
# calls; output differs only in each test's "data:" label.
crisis_cor_pairs <- list(
  c("INSURGENTS_rfreq", "GEO"),
  c("INSURGENTS_rfreq", "POL"),
  c("VICTIMS_rfreq", "DIS"),
  c("REFUGEE_rfreq", "DIS"),
  c("REFUGEE_rfreq", "MIL"),
  c("CITIZEN_rfreq", "POL"),
  c("POOR_rfreq", "POL")
)
dv_pair <- docvars(t.dtm2)[docvars(t.dtm2)$finite == TRUE, ]
for (p in crisis_cor_pairs) {
  print(cor.test(dv_pair[[p[1]]], dv_pair[[p[2]]]))
}

# Correlation of each people-category rate with the 20 topic scores, over all
# documents with finite rates.  Each column reproduces the original
# cor(cbind(<cat>_rfreq, DIS, ..., WEL))[, 1] (row 1 is the trivial
# self-correlation of 1); replaces twelve copy-pasted cor() calls.
# NOTE(review): POOR is absent, matching the original 12-column cbind() --
# possibly an intentional omission; confirm.
topic_cols <- c("DIS", "DOM", "ECO", "EDU", "ENV", "EPI", "FAM", "GEO", "HEA",
                "INF", "LAB", "LEI", "MIL", "OTH", "POL", "PUB", "REL", "SCI",
                "TRA", "WEL")
people_cats <- c("CROWD", "VICTIMS", "CITIZEN", "RESIDENT", "PEOPLE",
                 "ECONOMIC", "CRIMINALS", "PROTESTERS", "BELIEVERS", "REFUGEE",
                 "INSURGENTS", "PASSIVE")
dv_all <- docvars(t.dtm2)[docvars(t.dtm2)$finite == TRUE, ]
topic_common_corr <- round(
  vapply(
    paste0(people_cats, "_rfreq"),
    function(v) cor(dv_all[, c(v, topic_cols)])[, 1],
    numeric(length(topic_cols) + 1L)
  ),
  3
)

colnames(topic_common_corr) <- people_cats

# Same people-category vs topic correlation matrix, restricted to the Crisis
# corpus.  Replaces twelve copy-pasted cor() calls (POOR omitted, as in the
# original).
topic_cols <- c("DIS", "DOM", "ECO", "EDU", "ENV", "EPI", "FAM", "GEO", "HEA",
                "INF", "LAB", "LEI", "MIL", "OTH", "POL", "PUB", "REL", "SCI",
                "TRA", "WEL")
people_cats <- c("CROWD", "VICTIMS", "CITIZEN", "RESIDENT", "PEOPLE",
                 "ECONOMIC", "CRIMINALS", "PROTESTERS", "BELIEVERS", "REFUGEE",
                 "INSURGENTS", "PASSIVE")
dv_cc <- docvars(t.dtm2)[docvars(t.dtm2)$finite == TRUE & docvars(t.dtm2)$Corpus == "Crisis", ]
topic_common_corr_cc <- round(
  vapply(
    paste0(people_cats, "_rfreq"),
    function(v) cor(dv_cc[, c(v, topic_cols)])[, 1],
    numeric(length(topic_cols) + 1L)
  ),
  3
)

colnames(topic_common_corr_cc) <- people_cats

# Same people-category vs topic correlation matrix, restricted to the
# Noncrisis corpus.  Replaces twelve copy-pasted cor() calls (POOR omitted,
# as in the original).
topic_cols <- c("DIS", "DOM", "ECO", "EDU", "ENV", "EPI", "FAM", "GEO", "HEA",
                "INF", "LAB", "LEI", "MIL", "OTH", "POL", "PUB", "REL", "SCI",
                "TRA", "WEL")
people_cats <- c("CROWD", "VICTIMS", "CITIZEN", "RESIDENT", "PEOPLE",
                 "ECONOMIC", "CRIMINALS", "PROTESTERS", "BELIEVERS", "REFUGEE",
                 "INSURGENTS", "PASSIVE")
dv_nc <- docvars(t.dtm2)[docvars(t.dtm2)$finite == TRUE & docvars(t.dtm2)$Corpus == "Noncrisis", ]
topic_common_corr_nc <- round(
  vapply(
    paste0(people_cats, "_rfreq"),
    function(v) cor(dv_nc[, c(v, topic_cols)])[, 1],
    numeric(length(topic_cols) + 1L)
  ),
  3
)

colnames(topic_common_corr_nc) <- people_cats
# Numeric year for the period-restricted correlations below.
docvars(t.dtm2)$Year <- as.numeric(docvars(t.dtm2)$year)


# Correlations between each common-people relative frequency and the 20 topic
# shares, for finite documents published before 1900 (rounded to 3 decimals).
# One sapply() replaces twelve near-identical with()/cor() calls. Row names
# ("CROWD_rfreq" followed by the topic codes) come from the first column, just
# as with the original cbind(), which downstream code depends on; the first row
# of each column is that column's self-correlation (always 1).
topic_common_corr_1900 <- local({
  dv <- docvars(t.dtm2)
  dv <- dv[dv$finite == TRUE & dv$Year < 1900, ]
  targets <- c("CROWD_rfreq", "VICTIMS_rfreq", "CITIZEN_rfreq", "RESIDENT_rfreq",
               "PEOPLE_rfreq", "ECONOMIC_rfreq", "CRIMINALS_rfreq", "PROTESTERS_rfreq",
               "BELIEVERS_rfreq", "REFUGEE_rfreq", "INSURGENTS_rfreq", "PASSIVE_rfreq")
  topics <- c("DIS", "DOM", "ECO", "EDU", "ENV", "EPI", "FAM", "GEO", "HEA", "INF",
              "LAB", "LEI", "MIL", "OTH", "POL", "PUB", "REL", "SCI", "TRA", "WEL")
  round(sapply(targets, function(tv) {
    cor(dv[, c(tv, topics)], use = "complete.obs")[, 1]
  }), 3)
})

colnames(topic_common_corr_1900) <- c("CROWD", "VICTIMS", "CITIZEN", "RESIDENT",
                                      "PEOPLE", "ECONOMIC", "CRIMINALS", "PROTESTERS",
                                      "BELIEVERS", "REFUGEE", "INSURGENTS", "PASSIVE")

# Same correlation table restricted to 1900-1949 (finite documents only).
# One sapply() replaces twelve near-identical with()/cor() calls; row and
# column names reproduce the original cbind() output exactly.
topic_common_corr_1950 <- local({
  dv <- docvars(t.dtm2)
  dv <- dv[dv$finite == TRUE & dv$Year < 1950 & dv$Year > 1899, ]
  targets <- c("CROWD_rfreq", "VICTIMS_rfreq", "CITIZEN_rfreq", "RESIDENT_rfreq",
               "PEOPLE_rfreq", "ECONOMIC_rfreq", "CRIMINALS_rfreq", "PROTESTERS_rfreq",
               "BELIEVERS_rfreq", "REFUGEE_rfreq", "INSURGENTS_rfreq", "PASSIVE_rfreq")
  topics <- c("DIS", "DOM", "ECO", "EDU", "ENV", "EPI", "FAM", "GEO", "HEA", "INF",
              "LAB", "LEI", "MIL", "OTH", "POL", "PUB", "REL", "SCI", "TRA", "WEL")
  round(sapply(targets, function(tv) {
    cor(dv[, c(tv, topics)], use = "complete.obs")[, 1]
  }), 3)
})

colnames(topic_common_corr_1950) <- c("CROWD", "VICTIMS", "CITIZEN", "RESIDENT",
                                      "PEOPLE", "ECONOMIC", "CRIMINALS", "PROTESTERS",
                                      "BELIEVERS", "REFUGEE", "INSURGENTS", "PASSIVE")

# Same correlation table restricted to 1950-1979 (finite documents only).
# One sapply() replaces twelve near-identical with()/cor() calls; row and
# column names reproduce the original cbind() output exactly.
topic_common_corr_1980 <- local({
  dv <- docvars(t.dtm2)
  dv <- dv[dv$finite == TRUE & dv$Year < 1980 & dv$Year > 1949, ]
  targets <- c("CROWD_rfreq", "VICTIMS_rfreq", "CITIZEN_rfreq", "RESIDENT_rfreq",
               "PEOPLE_rfreq", "ECONOMIC_rfreq", "CRIMINALS_rfreq", "PROTESTERS_rfreq",
               "BELIEVERS_rfreq", "REFUGEE_rfreq", "INSURGENTS_rfreq", "PASSIVE_rfreq")
  topics <- c("DIS", "DOM", "ECO", "EDU", "ENV", "EPI", "FAM", "GEO", "HEA", "INF",
              "LAB", "LEI", "MIL", "OTH", "POL", "PUB", "REL", "SCI", "TRA", "WEL")
  round(sapply(targets, function(tv) {
    cor(dv[, c(tv, topics)], use = "complete.obs")[, 1]
  }), 3)
})

colnames(topic_common_corr_1980) <- c("CROWD", "VICTIMS", "CITIZEN", "RESIDENT",
                                      "PEOPLE", "ECONOMIC", "CRIMINALS", "PROTESTERS",
                                      "BELIEVERS", "REFUGEE", "INSURGENTS", "PASSIVE")

# Same correlation table restricted to 1980-1999 (finite documents only).
# One sapply() replaces twelve near-identical with()/cor() calls; row and
# column names reproduce the original cbind() output exactly.
topic_common_corr_2000 <- local({
  dv <- docvars(t.dtm2)
  dv <- dv[dv$finite == TRUE & dv$Year < 2000 & dv$Year > 1979, ]
  targets <- c("CROWD_rfreq", "VICTIMS_rfreq", "CITIZEN_rfreq", "RESIDENT_rfreq",
               "PEOPLE_rfreq", "ECONOMIC_rfreq", "CRIMINALS_rfreq", "PROTESTERS_rfreq",
               "BELIEVERS_rfreq", "REFUGEE_rfreq", "INSURGENTS_rfreq", "PASSIVE_rfreq")
  topics <- c("DIS", "DOM", "ECO", "EDU", "ENV", "EPI", "FAM", "GEO", "HEA", "INF",
              "LAB", "LEI", "MIL", "OTH", "POL", "PUB", "REL", "SCI", "TRA", "WEL")
  round(sapply(targets, function(tv) {
    cor(dv[, c(tv, topics)], use = "complete.obs")[, 1]
  }), 3)
})

colnames(topic_common_corr_2000) <- c("CROWD", "VICTIMS", "CITIZEN", "RESIDENT",
                                      "PEOPLE", "ECONOMIC", "CRIMINALS", "PROTESTERS",
                                      "BELIEVERS", "REFUGEE", "INSURGENTS", "PASSIVE")

# Same correlation table restricted to 2000-2020 (finite documents only).
# One sapply() replaces twelve near-identical with()/cor() calls; row and
# column names reproduce the original cbind() output exactly.
topic_common_corr_2020 <- local({
  dv <- docvars(t.dtm2)
  dv <- dv[dv$finite == TRUE & dv$Year < 2021 & dv$Year > 1999, ]
  targets <- c("CROWD_rfreq", "VICTIMS_rfreq", "CITIZEN_rfreq", "RESIDENT_rfreq",
               "PEOPLE_rfreq", "ECONOMIC_rfreq", "CRIMINALS_rfreq", "PROTESTERS_rfreq",
               "BELIEVERS_rfreq", "REFUGEE_rfreq", "INSURGENTS_rfreq", "PASSIVE_rfreq")
  topics <- c("DIS", "DOM", "ECO", "EDU", "ENV", "EPI", "FAM", "GEO", "HEA", "INF",
              "LAB", "LEI", "MIL", "OTH", "POL", "PUB", "REL", "SCI", "TRA", "WEL")
  round(sapply(targets, function(tv) {
    cor(dv[, c(tv, topics)], use = "complete.obs")[, 1]
  }), 3)
})

colnames(topic_common_corr_2020) <- c("CROWD", "VICTIMS", "CITIZEN", "RESIDENT",
                                      "PEOPLE", "ECONOMIC", "CRIMINALS", "PROTESTERS",
                                      "BELIEVERS", "REFUGEE", "INSURGENTS", "PASSIVE")

round(cbind(with(docvars(t.dtm2)[docvars(t.dtm2)$finite == TRUE, ], cor(cbind(CROWD_rfreq, VICTIMS_rfreq, CITIZEN_rfreq, RESIDENT_rfreq, PEOPLE_rfreq, ECONOMIC_rfreq, CRIMINALS_rfreq, PROTESTERS_rfreq, BELIEVERS_rfreq, REFUGEE_rfreq, INSURGENTS_rfreq, PASSIVE_rfreq)))), 3)

# Stack all eight correlation tables into one long data frame for plotting.
# The original called pivot_longer() twice per matrix (16 calls on 8 inputs);
# compute each long table once and reuse its value and name columns. Matrix
# order, corpus labels, and period labels are exactly as before, so row order
# in the result is unchanged.
corr_tables <- list(
  topic_common_corr, topic_common_corr_cc, topic_common_corr_nc,
  topic_common_corr_1900, topic_common_corr_1950, topic_common_corr_1980,
  topic_common_corr_2000, topic_common_corr_2020
)
long_tables <- lapply(corr_tables, function(m) pivot_longer(data.frame(m), cols = 1:12))

df_topic_common_corr <- data.frame(
  correlation = unlist(lapply(long_tables, function(x) x$value)),
  common_people = unlist(lapply(long_tables, function(x) x$name)),
  # 21 rownames x 12 categories = 252 rows per table, repeated for 8 tables.
  topic = rep(rep(rownames(topic_common_corr), each = 12), times = 8),
  corpus = rep(c("both", "crisis", "noncrisis", "both", "both", "both", "both", "both"),
               each = 252),
  period = factor(rep(c("1785-2020", "1785-2020", "1785-2020", "1785-1899", "1900-1949",
                        "1950-1979", "1980-1999", "2000-2020"), each = 252),
                  ordered = TRUE,
                  levels = c("1785-2020", "1785-1899", "1900-1949", "1950-1979",
                             "1980-1999", "2000-2020"))
)
# Drop the self-correlation row ("CROWD_rfreq" labels the first row of each table).
df_topic_common_corr2 <- subset(df_topic_common_corr, topic != "CROWD_rfreq")

# Correlations by topic, faceted by period (rows) and people category (columns).
ggplot(df_topic_common_corr2, aes(x = topic, y = correlation, colour = corpus, fill = corpus)) +
  geom_point() +
  facet_grid(period ~ common_people)

# Correlations over periods, faceted by people category (rows) and topic
# (columns), with a dashed red reference line at zero.
ggplot(df_topic_common_corr2, aes(x = period, y = correlation, colour = corpus, fill = corpus)) +
  geom_point() +
  geom_hline(yintercept = 0, colour = "red", linetype = "dashed") +
  facet_grid(common_people ~ topic) +
  theme_soft()




# Inspect the three full-period tables with the first row removed — that row
# is each column's self-correlation (always 1) and carries no information.
topic_common_corr[-1, ] # full corpus, 1785-2020
topic_common_corr_cc[-1, ] # crisis articles only
topic_common_corr_nc[-1, ] # noncrisis articles only


# Relationship between people-word relative frequency (x) and GEO topic share (y).
df_GEO_PEOPLE <- data.frame(
  x = docvars(t.dtm2)$PEOPLEWORDS_rfreq,
  y = docvars(t.dtm2)$GEO
)
# Flag rows where both values are finite (excludes NA, NaN, and +/-Inf).
df_GEO_PEOPLE$finite <- is.finite(df_GEO_PEOPLE$x) & is.finite(df_GEO_PEOPLE$y)

# Correlation matrix over the finite rows.
cor(df_GEO_PEOPLE[df_GEO_PEOPLE$finite == TRUE, 1:2])

# Significance test for the same pair.
cor.test(df_GEO_PEOPLE[df_GEO_PEOPLE$finite == TRUE, 1], df_GEO_PEOPLE[df_GEO_PEOPLE$finite == TRUE, 2])

# Linear trend; ggplot drops non-finite rows itself (with a warning).
ggplot(df_GEO_PEOPLE, aes(x = x, y = y)) +
  geom_smooth(method = "lm")


# Relationship between INSURGENTS relative frequency (x) and GEO topic share (y).
df_GEO_INSURG <- data.frame(x = docvars(t.dtm2)$INSURGENTS_rfreq, y = docvars(t.dtm2)$GEO)
# BUG FIX: the original recomputed 'finite' on df_GEO_PEOPLE (copy-paste from the
# previous analysis); flag the INSURG frame instead.
df_GEO_INSURG$finite <- is.finite(df_GEO_INSURG[, 1]) & is.finite(df_GEO_INSURG[, 2])


ggplot(df_GEO_INSURG, aes(x = x, y = y)) +
  geom_smooth(method = "lm")

# BUG FIX: the original called cor.test(df_GEO_PEOPLE) with a whole data.frame,
# which errors (cor.test needs two vectors) and targeted the wrong data anyway.
# Test the INSURG pair on its finite rows, mirroring the PEOPLE analysis above.
cor.test(df_GEO_INSURG[df_GEO_INSURG$finite == TRUE, 1], df_GEO_INSURG[df_GEO_INSURG$finite == TRUE, 2])

















cbind(as.numeric(docvars(t.dtm2)[187574:306982, "PEOPLEWORDS"]), as.numeric(look_ndfm_2LLL[, 1]))


docvars(t.dtm2)[match(docvars(t.dtm2)$id, docvars(look_cdfm_2LLL)$id), "PEOPLEWORDS"] <- look_cdfm_2LLL[, "PEOPLEWORDS"]


# Skeleton data frame with one row per publication year.
year_df <- data.frame(year = 1785:2020)

# Per-year document counts for the full corpus: the number of articles that
# mention each people category at least once (document counts, not token
# counts). A local helper replaces thirteen duplicated tapply() statements;
# each result matches the original per-category data.frame (years as rownames,
# one named count column). The helper is (re)defined here so this step stands
# on its own.
.yearly_doc_counts <- function(dfm_obj, feature, col_name) {
  counts <- tapply(rowSums(dfm_obj[, feature] > 0), docvars(dfm_obj)$year, sum, na.rm = TRUE)
  setNames(data.frame(counts), col_name)
}

crowd_ref <- .yearly_doc_counts(look_dfm_2LL, "CROWD", "crowd_ref")
victims_ref <- .yearly_doc_counts(look_dfm_2LL, "VICTIMS", "victims_ref")
citizen_ref <- .yearly_doc_counts(look_dfm_2LL, "CITIZEN", "citizen_ref")
resident_ref <- .yearly_doc_counts(look_dfm_2LL, "RESIDENT", "resident_ref")
people_ref <- .yearly_doc_counts(look_dfm_2LL, "PEOPLE", "people_ref")
economic_ref <- .yearly_doc_counts(look_dfm_2LL, "ECONOMIC", "economic_ref")
criminals_ref <- .yearly_doc_counts(look_dfm_2LL, "CRIMINALS", "criminals_ref")
protesters_ref <- .yearly_doc_counts(look_dfm_2LL, "PROTESTERS", "protesters_ref")
believers_ref <- .yearly_doc_counts(look_dfm_2LL, "BELIEVERS", "believers_ref")
refugee_ref <- .yearly_doc_counts(look_dfm_2LL, "REFUGEE", "refugee_ref")
poor_ref <- .yearly_doc_counts(look_dfm_2LL, "POOR", "poor_ref")
insurgents_ref <- .yearly_doc_counts(look_dfm_2LL, "INSURGENTS", "insurgents_ref")
passive_ref <- .yearly_doc_counts(look_dfm_2LL, "PASSIVE", "passive_ref")






# Per-year document counts for the crisis corpus (articles mentioning each
# category at least once). A local helper replaces thirteen duplicated
# tapply() statements; outputs match the original per-category data.frames
# (years as rownames, one named count column). The helper is (re)defined here
# so this step stands on its own.
.yearly_doc_counts <- function(dfm_obj, feature, col_name) {
  counts <- tapply(rowSums(dfm_obj[, feature] > 0), docvars(dfm_obj)$year, sum, na.rm = TRUE)
  setNames(data.frame(counts), col_name)
}

crisis_crowd_ref <- .yearly_doc_counts(look_cdfm_2LL, "CROWD", "crisis_crowd_ref")
crisis_victims_ref <- .yearly_doc_counts(look_cdfm_2LL, "VICTIMS", "crisis_victims_ref")
crisis_citizen_ref <- .yearly_doc_counts(look_cdfm_2LL, "CITIZEN", "crisis_citizen_ref")
crisis_resident_ref <- .yearly_doc_counts(look_cdfm_2LL, "RESIDENT", "crisis_resident_ref")
crisis_people_ref <- .yearly_doc_counts(look_cdfm_2LL, "PEOPLE", "crisis_people_ref")
crisis_economic_ref <- .yearly_doc_counts(look_cdfm_2LL, "ECONOMIC", "crisis_economic_ref")
crisis_criminals_ref <- .yearly_doc_counts(look_cdfm_2LL, "CRIMINALS", "crisis_criminals_ref")
crisis_protesters_ref <- .yearly_doc_counts(look_cdfm_2LL, "PROTESTERS", "crisis_protesters_ref")
crisis_believers_ref <- .yearly_doc_counts(look_cdfm_2LL, "BELIEVERS", "crisis_believers_ref")
crisis_refugee_ref <- .yearly_doc_counts(look_cdfm_2LL, "REFUGEE", "crisis_refugee_ref")
crisis_poor_ref <- .yearly_doc_counts(look_cdfm_2LL, "POOR", "crisis_poor_ref")
crisis_insurgents_ref <- .yearly_doc_counts(look_cdfm_2LL, "INSURGENTS", "crisis_insurgents_ref")
crisis_passive_ref <- .yearly_doc_counts(look_cdfm_2LL, "PASSIVE", "crisis_passive_ref")





# Per-year document counts for the noncrisis corpus (articles mentioning each
# category at least once). A local helper replaces thirteen duplicated
# tapply() statements; outputs match the original per-category data.frames
# (years as rownames, one named count column). The helper is (re)defined here
# so this step stands on its own.
.yearly_doc_counts <- function(dfm_obj, feature, col_name) {
  counts <- tapply(rowSums(dfm_obj[, feature] > 0), docvars(dfm_obj)$year, sum, na.rm = TRUE)
  setNames(data.frame(counts), col_name)
}

noncrisis_crowd_ref <- .yearly_doc_counts(look_ndfm_2LL, "CROWD", "noncrisis_crowd_ref")
noncrisis_victims_ref <- .yearly_doc_counts(look_ndfm_2LL, "VICTIMS", "noncrisis_victims_ref")
noncrisis_citizen_ref <- .yearly_doc_counts(look_ndfm_2LL, "CITIZEN", "noncrisis_citizen_ref")
noncrisis_resident_ref <- .yearly_doc_counts(look_ndfm_2LL, "RESIDENT", "noncrisis_resident_ref")
noncrisis_people_ref <- .yearly_doc_counts(look_ndfm_2LL, "PEOPLE", "noncrisis_people_ref")
noncrisis_economic_ref <- .yearly_doc_counts(look_ndfm_2LL, "ECONOMIC", "noncrisis_economic_ref")
noncrisis_criminals_ref <- .yearly_doc_counts(look_ndfm_2LL, "CRIMINALS", "noncrisis_criminals_ref")
noncrisis_protesters_ref <- .yearly_doc_counts(look_ndfm_2LL, "PROTESTERS", "noncrisis_protesters_ref")
noncrisis_believers_ref <- .yearly_doc_counts(look_ndfm_2LL, "BELIEVERS", "noncrisis_believers_ref")
noncrisis_refugee_ref <- .yearly_doc_counts(look_ndfm_2LL, "REFUGEE", "noncrisis_refugee_ref")
noncrisis_poor_ref <- .yearly_doc_counts(look_ndfm_2LL, "POOR", "noncrisis_poor_ref")
noncrisis_insurgents_ref <- .yearly_doc_counts(look_ndfm_2LL, "INSURGENTS", "noncrisis_insurgents_ref")
noncrisis_passive_ref <- .yearly_doc_counts(look_ndfm_2LL, "PASSIVE", "noncrisis_passive_ref")






# Per-year totals for the full / crisis / noncrisis corpora:
#  - *_corpus_size: number of documents per year (every count is > -1, so the
#    threshold -1 simply counts all documents);
#  - *_anypeople_ref: documents with at least one hit in the first dfm column.
# A local helper and a merge loop replace six duplicated tapply() statements
# and six duplicated match() assignments.
.yearly_total <- function(dfm_obj, threshold) {
  tapply(rowSums(dfm_obj[, 1] > threshold), docvars(dfm_obj)$year, sum, na.rm = TRUE)
}

corpus_size <- data.frame(corpus_size = .yearly_total(look_dfm_2LLL, -1))
anypeople_ref <- data.frame(anypeople_ref = .yearly_total(look_dfm_2LLL, 0))
crisis_corpus_size <- data.frame(crisis_corpus_size = .yearly_total(look_cdfm_2LLL, -1))
crisis_anypeople_ref <- data.frame(crisis_anypeople_ref = .yearly_total(look_cdfm_2LLL, 0))
noncrisis_corpus_size <- data.frame(noncrisis_corpus_size = .yearly_total(look_ndfm_2LLL, -1))
noncrisis_anypeople_ref <- data.frame(noncrisis_anypeople_ref = .yearly_total(look_ndfm_2LLL, 0))

# Merge into year_df by year (years are the rownames of each totals frame);
# years absent from a corpus become NA, as with the original match() calls.
for (col in c("corpus_size", "anypeople_ref", "crisis_corpus_size",
              "crisis_anypeople_ref", "noncrisis_corpus_size", "noncrisis_anypeople_ref")) {
  src <- get(col)
  year_df[[col]] <- src[match(year_df$year, rownames(src)), col]
}

# Merge every per-category yearly count into year_df by year. Each *_ref
# data.frame carries years as rownames and a single column named after itself,
# so one loop replaces 39 identical match() assignments. Column order in
# year_df is preserved: plain categories first, then crisis_*, then
# noncrisis_*; years missing from a source become NA, as before.
ref_names <- c("crowd_ref", "victims_ref", "citizen_ref", "resident_ref",
               "people_ref", "economic_ref", "criminals_ref", "protesters_ref",
               "believers_ref", "refugee_ref", "poor_ref", "insurgents_ref",
               "passive_ref")
for (col in c(ref_names, paste0("crisis_", ref_names), paste0("noncrisis_", ref_names))) {
  src <- get(col)
  year_df[[col]] <- src[[col]][match(year_df$year, rownames(src))]
}

# Relative frequencies: per-year reference counts divided by the size of
# the corresponding (sub)corpus. Three separate loops keep the original
# column-creation order (all crisis_*_rfreq, then noncrisis_*_rfreq, then
# overall *_rfreq) because later code indexes year_df columns by position.
rfreq_fields <- c(
  "crowd", "victims", "citizen", "resident", "people", "economic",
  "criminals", "protesters", "believers", "refugee", "poor",
  "insurgents", "passive"
)

for (wf in rfreq_fields) {
  year_df[[paste0("crisis_", wf, "_rfreq")]] <-
    year_df[[paste0("crisis_", wf, "_ref")]] / year_df$crisis_corpus_size
}

for (wf in rfreq_fields) {
  year_df[[paste0("noncrisis_", wf, "_rfreq")]] <-
    year_df[[paste0("noncrisis_", wf, "_ref")]] / year_df$noncrisis_corpus_size
}

for (wf in rfreq_fields) {
  year_df[[paste0(wf, "_rfreq")]] <-
    year_df[[paste0(wf, "_ref")]] / year_df$corpus_size
}


# Proportions among people-referencing articles: each field's reference
# count divided by the count of articles with ANY common-people reference.
# Loop order mirrors the original column-creation order (crisis_*, then
# noncrisis_*, then overall), since later code indexes columns by position.
rprop_fields <- c(
  "crowd", "victims", "citizen", "resident", "people", "economic",
  "criminals", "protesters", "believers", "refugee", "poor",
  "insurgents", "passive"
)

for (wf in rprop_fields) {
  year_df[[paste0("crisis_", wf, "_rprop")]] <-
    year_df[[paste0("crisis_", wf, "_ref")]] / year_df$crisis_anypeople_ref
}

for (wf in rprop_fields) {
  year_df[[paste0("noncrisis_", wf, "_rprop")]] <-
    year_df[[paste0("noncrisis_", wf, "_ref")]] / year_df$noncrisis_anypeople_ref
}

for (wf in rprop_fields) {
  year_df[[paste0(wf, "_rprop")]] <-
    year_df[[paste0(wf, "_ref")]] / year_df$anypeople_ref
}


# Share of articles with any common-people reference, per (sub)corpus.
year_df$anypeople_rfreq <- year_df$anypeople_ref / year_df$corpus_size
year_df$crisis_anypeople_rfreq <- year_df$crisis_anypeople_ref / year_df$crisis_corpus_size
year_df$noncrisis_anypeople_rfreq <- year_df$noncrisis_anypeople_ref / year_df$noncrisis_corpus_size

# Long-format table of the two any-people shares for plotting/effect sizes.
year_df2 <- pivot_longer(year_df, cols = c("crisis_anypeople_rfreq", "noncrisis_anypeople_rfreq"))

# NOTE(review): the pivots and corpus labels below select year_df columns
# by POSITION (47:85 = the 39 *_rfreq columns, 89:127 = the 39 *_rprop
# columns). This silently breaks if any column is added, dropped or
# reordered upstream -- confirm the indices against names(year_df).
year_ldf <- pivot_longer(year_df, cols = names(year_df[47:85]))

year_ldf2 <- pivot_longer(year_df, cols = names(year_df[89:127]))



# Label each long-format row with its corpus based on which positional
# column block the pivoted name came from (crisis / noncrisis / any).
year_ldf$corpus <- ifelse(year_ldf$name %in% names(year_df)[47:59], "crisis",
  ifelse(year_ldf$name %in% names(year_df)[60:72], "noncrisis", "any")
)
# Strip corpus prefix and measure suffix to recover the bare word field.
year_ldf$wordfield <- str_remove_all(year_ldf$name, "crisis_|noncrisis_|_rfreq")

# Fixed display order for the 13 word fields (controls facet ordering).
year_ldf$wordfield_o <- factor(year_ldf$wordfield, levels = c("people", "economic", "crowd", "citizen", "resident", "passive", "criminals", "victims", "insurgents", "refugee", "poor", "protesters", "believers"), ordered = TRUE)


# Same labelling for the proportion (rprop) long table.
year_ldf2$corpus <- ifelse(year_ldf2$name %in% names(year_df)[89:101], "crisis",
  ifelse(year_ldf2$name %in% names(year_df)[102:114], "noncrisis", "any")
)
year_ldf2$wordfield <- str_remove_all(year_ldf2$name, "crisis_|noncrisis_|_rprop")

year_ldf2$wordfield_o <- factor(year_ldf2$wordfield, levels = c("people", "economic", "crowd", "citizen", "resident", "passive", "criminals", "victims", "insurgents", "refugee", "poor", "protesters", "believers"), ordered = TRUE)



breaks_log10 <- function(x) {
  # Axis-break helper for log10 scales: returns the powers of ten that
  # span the range of `x` (inclusive on both sides).
  #
  # @param x Numeric vector of positive values; NAs are now ignored.
  #   Non-positive values still yield -Inf/NaN from log10, as before.
  # @return Numeric vector of powers of ten covering range(x).
  low <- floor(log10(min(x, na.rm = TRUE)))
  high <- ceiling(log10(max(x, na.rm = TRUE)))

  10^(seq.int(low, high))
}

# Crisis vs non-crisis share of articles referencing each word field,
# faceted by field, 1785-2020.
# NOTE(review): this first version (points + smoothers) is assigned and
# then immediately overwritten by the smoother-only version below, so it
# is never saved -- rename it (or drop it) if the point version is still
# wanted.
plot_cnc_wfref_1785_2020 <- ggplot(subset(year_ldf, corpus != "any"), aes(y = 100 * value, x = year, shape = corpus, color = corpus, fill = corpus, group = corpus)) +
  geom_point(size = 0.25) +
  geom_smooth() +
  theme_soft() +
  facet_wrap(. ~ wordfield_o, scale = "free_y") +
  ylab("Share of articles that include one or more references to...") +
  xlab("Year") +
  scale_x_continuous(minor_breaks = seq(1780, 2020, 20)) +
  scale_color_viridis_d(end = .75) +
  scale_fill_viridis_d(end = .75)

# Smoother-only version; this is the one that gets written to disk.
plot_cnc_wfref_1785_2020 <- ggplot(subset(year_ldf, corpus != "any"), aes(y = 100 * value, x = year, shape = corpus, color = corpus, fill = corpus, group = corpus)) +
  geom_smooth() +
  theme_soft() +
  facet_wrap(. ~ wordfield_o, scale = "free_y") +
  ylab("Share of articles that include one or more references to...") +
  xlab("Year") +
  scale_x_continuous(minor_breaks = seq(1780, 2020, 20)) +
  scale_color_viridis_d(end = .75) +
  scale_fill_viridis_d(end = .75)

ggsave(plot_cnc_wfref_1785_2020, file = "plot_cnc_wfref_1785+2020.svg", unit = "cm", width = 16, height = 8, dpi = 1200, scale = 1.5)

# Same comparison on the proportion scale: among articles that refer to
# common people at all, the share referencing each specific word field.
plot_cnc_wfprop_1785_2020 <- ggplot(
  subset(year_ldf2, corpus != "any"),
  aes(y = 100 * value, x = year, shape = corpus, color = corpus, fill = corpus, group = corpus)
) +
  geom_smooth() +
  theme_soft() +
  facet_wrap(. ~ wordfield_o, scale = "free_y") +
  labs(
    y = "Share of articles that refer to common people that include one or more references to...",
    x = "Year"
  ) +
  scale_x_continuous(minor_breaks = seq(1780, 2020, 20)) +
  scale_color_viridis_d(end = .75) +
  scale_fill_viridis_d(end = .75)

ggsave(plot_cnc_wfprop_1785_2020, file = "plot_cnc_wfprop_1785+2020.svg", unit = "cm", width = 16, height = 8, dpi = 1200, scale = 1.5)



# Paired t-test: within-year difference in the share of articles with any
# common-people reference between the crisis and non-crisis corpora.
t.test(year_df$crisis_anypeople_rfreq, year_df$noncrisis_anypeople_rfreq, paired = TRUE)

# Complete cases only (years present in both corpora) for the effect size.
fulldata <- year_df[!is.na(year_df$noncrisis_anypeople_rfreq) & !is.na(year_df$crisis_anypeople_rfreq), c("noncrisis_anypeople_rfreq", "crisis_anypeople_rfreq")]



fulldata %>% cohens_d(noncrisis_anypeople_rfreq ~ crisis_anypeople_rfreq, paired = TRUE)

cohens_d(data = year_df2, value ~ name, paired = TRUE)

ggplot(year_df2, aes(y = value * 100, x = year, group = name, fill = name, color = name, shape = name)) +
  geom_point() +
  geom_smooth() +
  theme_soft()

# Mean share per word field, in percent, sorted ascending. The per-field
# means are computed once and reused instead of calling tapply() twice.
wordfield_means <- with(year_ldf, tapply(value, wordfield, mean, na.rm = TRUE))
round(100 * wordfield_means[order(wordfield_means)], 2)




# Overall trend 1785-2020: share of articles with any common-people
# reference. The constant "1" aesthetic forces a single viridis colour.
# NOTE(review): the y label contains literal "<i>" tags -- rendering them
# as italics needs a markdown-aware theme element (e.g. ggtext); confirm.
plot_anyref_1785_2020 <- ggplot(
  year_df,
  aes(y = 100 * anypeople_rfreq, x = year, color = "1", fill = "1")
) +
  geom_point() +
  geom_smooth(span = .25) +
  theme_soft() +
  scale_color_viridis_d() +
  scale_fill_viridis_d() +
  theme(legend.position = "none") +
  labs(
    y = "Share of articles with any references to <i>common people</i>",
    x = "Year"
  ) +
  ylim(0, 100) +
  scale_x_continuous(minor_breaks = seq(1780, 2020, 10))

ggsave(plot_anyref_1785_2020, file = "plot_anyref_1785+2020.svg", unit = "cm", width = 16, height = 8, dpi = 1200, scale = 1.5)










# Fit latent semantic scaling models for each period subcorpus, with the
# round-1 seed set and then the round-2 (narrow) seed set. The two loops
# preserve the original call order: all r1 fits first, then all r2n fits.
lss_periods <- c(1900, 1950, 1980, 2000, 2020)

for (period in lss_periods) {
  assign(
    paste0("people_fit_", period, "_r1"),
    LSX::textmodel_lss(x = get(paste0("lsx_", period)), seeds = people_words_r1)
  )
}

for (period in lss_periods) {
  assign(
    paste0("people_fit_", period, "_r2n"),
    LSX::textmodel_lss(x = get(paste0("lsx_", period)), seeds = people_words_r2n)
  )
}


# Random 50k-document sample of the trimmed dfm (draws from the RNG).
lsx_50k <- dfm_sample(lsx_dtm, size = 50000, replace = FALSE, prob = NULL, by = NULL)




# Draw a random sample of 1,000 concordance lines for manual inspection.
# Sample from the actual row count instead of the hard-coded total 27520,
# so the code stays correct if kwic_nc_1000 changes size.
# NOTE(review): no set.seed() precedes this, so the sample was never
# reproducible -- add one if reproducibility is wanted.
timeref_sample_1000 <- (kwic_nc_1000)[sample(seq_len(nrow(kwic_nc_1000)), 1000), ]

# Format each line as pre-context, highlighted keyword, post-context.
out_texts <- paste(timeref_sample_1000$pre, "**********", str_to_upper(timeref_sample_1000$keyword), "**********", timeref_sample_1000$post, "\n\n\n\n\n")

# Hard-wrap at 100 characters for readable plain-text output.
out_wrapped <- strwrap(out_texts, 100)

write.table(out_wrapped, file = "timeref_sample_1000.txt", sep = " ", row.names = FALSE, col.names = FALSE)



# Assemble, per wave and per article, a plain-text dump of the wave's
# features, the article's docvars, and the wrapped article text.
# NOTE(review): `everything` and `filename` are built but never written
# anywhere -- the writeLines()/cat() step appears to be missing; confirm.
# (An `everything <- list(...)` that was immediately overwritten by the
# paste() below has been removed as dead code.)
for (i in seq_along(wave.texts10))
{
  for (j in seq_along(wave.texts10[[i]]))
  {
    wave.features <- paste(names(wave.feat[[i]]), (wave.feat[[i]]), collapse = " ")
    article.features <- paste(names(docvars(wave.texts10[[i]])), docvars(wave.texts10[[i]])[j, ], collapse = " ")
    article.text <- strwrap(paste(wave.texts[[i]][[j]], collapse = " "), 100)
    everything <- paste(wave.features, "\n\n\n", article.features, "\n\n\n", paste(article.text, collapse = "\n"))
    filename <- paste0("wave", wave.feat[[i]]$id, ".", names(wave.texts[[i]])[[j]], ".txt")
  }
  print(i)
  flush.console()
}

4.4.3 Environment references

Code
# Bucket years into decades, both on the dfm docvars and on the archive
# article counts.
docvars(t.dtm)$Decade <- 10 * floor(as.numeric(docvars(t.dtm)$year) / 10)

thetimes$Decade <- 10 * floor(as.numeric(thetimes$year) / 10)

# Documents per decade and corpus in the dfm...
dfd.corpus <- docvars(t.dtm) %>%
  group_by(Decade, Corpus) %>%
  summarise(n = n())

# ...and total articles per decade in the full archive.
n.full <- thetimes %>%
  group_by(Decade) %>%
  summarise(articles = sum(articles))

# Attach the full-archive decade totals to the per-corpus table.
dfd.corpus$n.full <- n.full$articles[match(dfd.corpus$Decade, n.full$Decade)]



# Keep only features occurring in at least 100 documents.
lsx_dtm <- dfm_trim(t.dtm,min_docfreq=100,docfreq_type="count")

# Flag documents where nature salience exceeds 50 (from df.nature).
docvars(lsx_dtm)$nature <- (df.nature$nature_salience>50)

lsx_nature <- dfm_subset(lsx_dtm,nature==TRUE)

# NOTE(review): this re-creates lsx_50k with a fresh random draw; an
# earlier identical call exists above -- confirm the resample is intended.
lsx_50k <- dfm_sample(lsx_dtm, size = 50000, replace = FALSE, prob = NULL, by = NULL)

economy_words_r1 <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("econom*", "business*", "financ*"))
life_words_r1 <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("dead*", "death*", "life*", "live*", "wound*", "poison*", "injur*", "kill*", "murder*", "casualt*"))
# NOTE(review): politics uses x=t.dtm (the untrimmed dfm) while the two
# models above use lsx_dtm, and its seed list repeats "murder*" and
# "casualt*" from the life seeds -- looks like a copy-paste slip; confirm.
politics_words_r1 <- LSX::textmodel_lss(x=t.dtm,seeds=c("polit*", "state*", "govern*", "execut*", "parlia*", "represent*", "minist*", "administ*", "murder*", "casualt*"))





# Round-1 nature word-field models: one LSS fit per thematic seed set.
analytical.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("ecosystem*", "wildlife", "biodiversity", "biosecurity"))

landscape.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("woodland*", "forest*", "landscape*", "hillside*"))

environment.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("soil*", "environment", "environments", "environmental", "habitat*", "nature", "ecological*", "atmospher*"))

lifeform_general.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("species", "fung*", "flora", "fauna", "animal*", "specimen*"))

lifeform_plants.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("tree*", "elm", "larch", "ash", "oak*", "ulmus", "chestnut*", "botanic*", "vegetation*", "birch*", "flower", "flowers", "blossom*"))

lifeform_insects.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("beetle*", "insect*", "spider*"))

lifeform_microbes.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("microbial", "bacter*", "microorg*", "fung*"))

# Fixed: "foliag**" had a doubled wildcard (typo); now "foliag*".
lifeform_parts.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("bark*", "foliag*", "leaf*"))

geospread.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("native", "invasive"))

reproduction.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("spore*", "seedling*", "sapling*", "pollinat*"))

bioprocesses.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("photosynthesis"))

evolution.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("extinct*", "evolution*"))

damage_illness.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("fraxinea", "pest*"))

damage_wound.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("lesion*", "wound*"))

threat_to_nature.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("dieback", "pollution*"))

threat_by_nature.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("volcan*", "flood*"))

human_extraction.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("firewood", "logs", "farm*"))

human_planning.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("hectares"))

human_domination.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c(("landowner*"))

# Remaining round-1 nature word-field models.
human_treatment.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("forestry", "forester", "felling", "replanting", "planting", "hedgerow*", "planted", "treeplanting", "garden*", "pesticides", "field*"))

human_protection.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("conservation", "environmentalists", "ecologist*", "ecological*"))

human_awe.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("graceful"))

human_affected.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("pollinat*"))

seasonality.nature.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("autumn*", "spring*", "summer*", "winter*", "flowering*"))

# NOTE(review): three byte-identical re-fits of seasonality.nature.words
# (duplicating the assignment directly above) were removed here -- each
# recomputed the same model and overwrote the same variable. Presumably
# the fit is deterministic; if textmodel_lss consumes the RNG, removing
# these shifts downstream random draws -- confirm.

##### Round 2

# Round-2 word-field models; the shorter seed sets run on the full trimmed
# dfm, the longer ones on the 50k random sample (lsx_50k).
birds.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("birds", "hummingbird", "parrot", "bat", "nest*", "breed*"))
water.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("dolphin*", "whale", "submarine", "ocean*", "marine", "fish"))
landscape.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("woodland*", "forest*", "landscape*", "hillside*", "steppe", "isle*", "island*", "crater", "canyon", "coast*", "seabed", "deep-sea", "ocean*", "woodland", "slope", "valley*", "stream*", "mountain", "estuary"))
mammal.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("cheetah*", "mammal*", "herds"))
# NOTE(review): "beatle*" below may be a deliberate OCR-variant catch for
# "beetle" in the historical scans rather than a typo -- confirm.
insect.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("beetle*", "beatle*", "insect*", "butterfl*", "spider*", "fly", "flies", "mosquito*"))
abundance.words <- LSX::textmodel_lss(x=lsx_dtm,seeds=c("vast", "expanse", "immeasurable", "space", "distance", "view", "vista", "perspective", "horizon", "abund*", "neverend*", "unend*", "unlimit*"))
extraction.words <- LSX::textmodel_lss(x=lsx_50k,seeds=c("firewood", "logs", "farm*", "resource*", "fisherman", "fishermen", "logging", "thanksgiving", "honey", "cowboy*", "seafood*", "catch", "hunt*", "pest*", "vegetabl*", "manure", "soil*", "sow*", "acre*", "agricult*", "compost*", "shading", "tomato*", "strawberr*", "broccoli", "fruit*", "cabbage", "grape*", "raspberr*", "apricot*", "slaughter*", "butcher*", "lamb*", "plough*", "crop*", "livestock*", "husbandr*"))
# NOTE(review): "discover*" appears twice in the exploration seed list.
exploration.words <-  LSX::textmodel_lss(x=lsx_50k,seeds=c("discover*", "indomitable", "reveal*", "discover*", "treasur*", "secret*", "supernatur*", "tough*", "legend*", "adventur*", "climber*", "voyag*"))
humanperspective.words <-  LSX::textmodel_lss(x=lsx_50k,seeds=c("mankind", "naked", "humanity", "naturist", "naturalised", "human", "man-made", "premeditated", "genius", "wilderness"))
humandomination.words <- LSX::textmodel_lss(x=lsx_50k,seeds=c("hectares", "landowner*", "triumph*", "faculty", "ambition", "stately", "pesticides", "inoculat*", "medicin*", "antibiotics"))
humanprotection.words <- LSX::textmodel_lss(x=lsx_50k,seeds=c("conservation", "conservationist*", "environmentalist*", "ecologist*", "ecological*", "preservation", "preserv*", "sensitive", "guard", "valuabl*", "greenpeace", "ozone", "layer", "awareness"))
religion.words <- LSX::textmodel_lss(x=lsx_50k,seeds=c("eden", "god", "biblical", "theolog*", "dedicat*", "christ*", "religi*", "sermon", "anglican", "creator", "spirit*", "jesus", "angels", "illumination", "divine"))
primitive.words <- LSX::textmodel_lss(x=lsx_50k,seeds=c("primit*", "bushcraft*", "civilisation", "civilized", "civilised", "uncivilised", "uncivilized"))
rurality.words <- LSX::textmodel_lss(x=lsx_50k,seeds=c("countryside", "cottage", "farmhous*", "lodge", "village", "bonfire"))
building.words <- LSX::textmodel_lss(x=lsx_50k,seeds=c("waterfront", "architecture", "roads", "towering"))

places.words <-     LSX::textmodel_lss(x=lsx_50k,seeds=c("alaska", "antarctica", "arctic", "everest"))



# Same round-2 models refit on the nature-salient subset (lsx_nature),
# several with expanded seed lists. These OVERWRITE the lsx_dtm/lsx_50k
# fits of the same names directly above.
birds.words <- LSX::textmodel_lss(x=lsx_nature,seeds=c("birds", "hummingbird", "parrot", "bat", "nest*", "breed*"))
water.words <- LSX::textmodel_lss(x=lsx_nature,seeds=c("dolphin*", "whale", "submarine", "ocean*", "marine", "fish"))
landscape.words <- LSX::textmodel_lss(x=lsx_nature,seeds=c("woodland*", "forest*", "landscape*", "hillside*", "hill*", "steppe", "isle*", "island*", "crater", "canyon", "coast*", "seabed", "deep-sea", "ocean*", "woodland*", "slope*", "valley*", "stream*", "mountain*", "estuar*", "desert*"))
mammal.words <- LSX::textmodel_lss(x=lsx_nature,seeds=c("cheetah*", "mammal*", "herds"))
insect.words <- LSX::textmodel_lss(x=lsx_nature,seeds=c("beetle*", "beatle*", "insect*", "butterfl*", "spider*", "fly", "flies", "mosquito*"))
abundance.words <- LSX::textmodel_lss(x=lsx_nature,seeds=c("vast", "expanse", "immeasurable", "space", "distance", "view", "vista", "perspective", "horizon", "abund*", "neverend*", "unend*", "unlimit*"))
# NOTE(review): the extraction seed list repeats "soil*", "agricult*" and
# "pest*"; duplicates left untouched in case seed multiplicity matters.
extraction.words <- LSX::textmodel_lss(x=lsx_nature,seeds=c("firewood", "logs", "farm*", "resource*", "fisherman", "fishermen", "logging", "thanksgiving", "honey", "cowboy*", "seafood*", "catch", "hunt*", "pest*", "vegetabl*", "manure", "soil*", "sow*", "acre*", "agricult*", "compost*", "shading", "tomato*", "strawberr*", "broccoli", "fruit*", "cabbage", "grape*", "raspberr*", "apricot*", "slaughter*", "butcher*", "lamb*", "plough*", "crop*", "livestock*", "husbandr*", "potato", "cerea*", "turnip*", "weed*", "fallow*", "drought*", "soil*", "agricult*", "bush*", "pest*", "milk*", "pig*", "manur*", "fodder*", "pastur*", "pastor*", "swin*", "cow*", "beef*", "cattl*", "trawler*", "fisher*", "pipelin*", "refiner*", "cobalt*", "mining*", "miner*", "quarri*", "quarry*"))
exploration.words <-  LSX::textmodel_lss(x=lsx_nature,seeds=c("discover*", "indomitable", "reveal*", "discover*", "treasur*", "secret*", "tough*", "legend*", "adventur*", "climber*", "voyag*"))
humanperspective.words <-  LSX::textmodel_lss(x=lsx_nature,seeds=c("mankind", "naked", "humanity", "naturist", "naturalised", "human", "man-made", "premeditated", "genius", "wilderness", "barbar*", "utop*"))
humandomination.words <- LSX::textmodel_lss(x=lsx_nature,seeds=c("hectares", "landowner*", "triumph*", "faculty", "ambition", "stately", "pesticides", "inoculat*", "medicin*", "antibiotics"))
humanprotection.words <- LSX::textmodel_lss(x=lsx_nature,seeds=c("conservation", "conservationist*", "environmentalist*", "ecologist*", "ecological*", "preservation", "preserv*", "sensitive", "guard", "valuabl*", "greenpeace", "ozone", "layer", "awareness", "recycl*", "pollut*", "radioact*", "seabed*", "energ*", "contain*", "methan*", "emission*", "reactor*", "self-suffic*", "warming*", "dioxid*", "spill*", "carbon*", "decarbon*", "gases", "greenhouse", "emission*", "concentrations", "bicarbonate", "deplet*", "emit*", "co2", "CO2"))
religion.words <- LSX::textmodel_lss(x=lsx_nature,seeds=c("eden", "god", "biblical", "theolog*", "dedicat*", "christ*", "religi*", "sermon", "anglican", "creator", "spirit*", "jesus", "angels", "illumination", "divine", "supernatur*"))
primitive.words <- LSX::textmodel_lss(x=lsx_nature,seeds=c("primit*", "bushcraft*", "civilisation", "civilized", "civilised", "uncivilised", "uncivilized"))
rurality.words <- LSX::textmodel_lss(x=lsx_nature,seeds=c("countryside", "cottage", "farmhous*", "lodge", "village", "bonfire"))
building.words <- LSX::textmodel_lss(x=lsx_nature,seeds=c("waterfront", "architecture", "roads", "towering"))
humantreatment.words <- LSX::textmodel_lss(x=lsx_nature,seeds=c("forestry", "forester", "felling", "replanting", "planting", "hedgerow*", "planted", "treeplanting", "garden*", "pesticides", "field*", "terrace*", "nitrogen*", "landfill"))
lifeformland.words <- LSX::textmodel_lss(x=lsx_nature,seeds=c("cheetah*", "mammal*", "herds", "reindeer", "dinosaur*"))
places.words <-     LSX::textmodel_lss(x=lsx_nature,seeds=c("alaska", "antarctica", "arctic", "everest"))

evolution.words <- LSX::textmodel_lss(x=lsx_nature,seeds=c("extinct*", "ancient", "survival*", "origin*"))

damageillness.words <- LSX::textmodel_lss(x=lsx_nature,seeds=c("fraxinea", "pest*", "toxic", "incubat*", "inoculat*", "disease", "illness", "infect*", "epidem*", "pandem*", "malady", "maladi*", "malaria", "virulence", "virus", "viral*", "symptom*", "typh*", "antibiotic*", "anthrax*", "scab*", "foot-and-mouth", "outbreak*"))

# Consolidated nature dictionary: 35 thematic categories of glob patterns,
# used below to score every document via dfm(..., dictionary = myNature).
# The category count (35) must match the ncol used when densifying the
# resulting dfm for the correlation/factor analysis.
# NOTE(review): some entries look like typos -- "lemom*" (lemon?) and
# "rhodedendron*" (rhododendron?) in particular; "beatle*" may be a
# deliberate OCR-variant catch. Several patterns repeat within a vector
# (e.g. "emission*" twice in human_protection). Left unchanged; confirm
# before editing, since the glob strings are matched against the corpus.
myNature <- dictionary(list(
                        places=c("alaska", "antarctica", "everest", "ice", "greenland", "latitude*", "glacier*", "moon*", "planet*", "northerl*", "mercury", "venus", "uranus", "mars", "neptun*", "saturn", "jupiter", "pluto", "altitude*"),
                        tourism=c("trip*", "attraction*"),
                        climate=c("warming*", "*warming", "climat*", "*dioxide", "greenhous*", "emiss*", "emit*", "carbo*", "decarbo*", "methan*", "concentrations", "co2", "CO2"),
                        weather=c("weather*", "chill*", "downpour*", "rain", "temperatur*", "celsius*", "fahrenheit*", "warm*", "cold*", "windy*", "cloud*", "barometer*"),
                        analytical=c("ecosystem*", "wildlife", "biodiversity", "biosecurity", "element", "particles", "substance*", "genetic*"),
                        landscape=c("woodland*", "forest*", "landscape*", "hillside*", "steppe", "isle*", "island*", "crater", "canyon", "coast*", "seabed", "deep-sea", "ocean*", "woodland*", "slope*", "valley*", "stream*", "mountain*", "estuar*", "desert*", "ridg*", "surface*", "waterfall*"),
                        environment=c("soil*", "environment", "environments", "environmental", "habitat*", "natur*", "ecology", "ecological*", "atmospher*", "earth", "nitrogen*"),
                        lifeform_general=c("species", "fung*", "flora", "fauna", "animal*", "specimen*", "creatur*", "habitat*", "predator*", "organism*", "pest*", "parasite*"),
                        lifeform_plants=c("pine*", "oak*", "elm*", "plant*", "tree*", "elm", "larch", "ash", "oak*", "ulmus", "chestnut*", "botanic*", "vegetation*", "birch*", "flower*", "oak*", "grass", "blossom*", "root*", "weed*", "foliag*", "stem*", "berries", "berry", "bloom*", "herbaceous", "shrub*", "herb*", "perenn*", "apricot*", "hybrid*", "fragrant*", "anemon*", "bark", "leaf", "tomato*", "strawberr*", "broccoli", "azaleas", "crocus", "fruit*", "grape*", "cabbage", "roses", "lavender", "climbers", "raspberries", "narcissus", "apricot", "gladiol*", "rhodedendron*", "tulip*", "hyacinth*", "daffodil*", "sturd*", "bud*", "bush*"),
                        lifeform_land=c("cheetah*", "mammal*", "herds", "reindeer", "dinosaur*"),
                        lifeform_insects=c("beetle*", "beatle*", "insect*", "butterfl*", "spider*", "fly", "flies", "mosquito*"),
                        lifeform_microbes=c("microbial", "bacter*", "bacill*", "fung*", "microorg*"),
                        lifeform_birds=c("birds", "hummingbird", "parrot", "bat", "nest*", "breed*"),
                        lifeform_water=c("dolphin*", "whale", "submarine", "ocean*", "marine", "fish*", "water*", "underwater*", "sea*", "currents", "shark*", "turtle*"),
                        geospread=c("native", "invasive", "invad*", "intrud*", "indigen*", "autochth*", "introduc*", "endemic*"),
                        abundance=c("vast", "expanse", "immeasurable", "space", "distance", "view", "vista", "perspective", "horizon", "abund*", "neverend*", "unend*", "unlimit*", "cloud*", "strech*", "astronom*", "faint", "beyond", "reaches", "drear*", "ray*"),
                        reproduction=c("spore*", "seedling*", "sapling*", "pollinat*", "seed*"),
                        evolution=c("extinct*", "ancient", "survival*", "origin*", "specie*", "endanger*", "biodivers*", "populations", "captive", "captivity", "evolution*"),
                        damage_illness=c("fraxinea", "pest*", "toxic", "incubat*", "inoculat*", "diseas*", "illness", "infect*", "epidem*", "pandem*", "malady", "maladi*", "malaria", "virulence", "virus", "viral*", "symptom*", "typh*", "antibiotic*", "anthrax*", "scab*", "foot-and-mouth", "outbreak*", "viru*", "viral*", "antibod*", "epidemic*", "pandemic*", "spread*", "meningitis*", "incidenc*", "endemic*", "fever*", "transmit*", "suspect*", "vaccin*", "cases", "carrier*", "contract*", "veterinar*", "doctor*", "medicin*", "influenza*", "patients", "clinical*", "coronavir*", "pneumon*", "hiv", "smallpox", "sars", "flu*", "immun*", "plague*"),
                        damage_wound=c("lesion*", "bite*"),
                        threat_to_nature=c("dieback", "pollution*", "destruction", "fall-out", "chernobyl", "unmitigated", "radioactive", "dying", "intruder", "deadly", "devastat*", "spil*", "*spill", "lethal", "leaking", "contaminat*", " pesticides", "waste", "pollut*", "radiation", "apocalyp*"),
                        threat_by_nature=c("volcan*", "flood*", "earthquake*", "tragedies", "sinking", "storm", "gale", "tornado*", "hurrican*", "tsunam*", "destruction", "struck", "stricken", "wreck", "survivor", "forces", "preparedness", "dangerous", "radioactive", "discharges", "titanic", "dying", "deadly", "devastat*", "lethal", "drought*", "radiation", "parasit*", "pollinat*", "avalanch*", "pollen*", "infection*"),
                        seasonality=c("autumn*", "spring*", "summer*", "winter*", "flowering*", "thanksgiving"),
                        human_perspective=c("mankind", "naked", "humanity", "naturist", "naturalised", "human", "man-made", "premeditated", "genius", "wilderness", "barbar*", "utop*"),
                        human_extraction=c("firewood", "logs", "farm*", "resource*", "fisherman", "fishermen", "logging", "thanksgiving", "honey", "cowboy*", "seafood*", "catch", "hunt*", "pest*", "vegetabl*", "manure", "soil*", "sow*", "acre*", "agricult*", "compost*", "shading", "tomato*", "strawberr*", "broccoli", "fruit*", "cabbage", "grape*", "raspberr*", "apricot*", "slaughter*", "butcher*", "lamb*", "plough*", "crop*", "livestock*", "husbandr*", "potato", "cerea*", "turnip*", "weed*", "fallow*", "drought*", "soil*", "agricult*", "bush*", "pest*", "milk*", "pig*", "manur*", "fodder*", "pastur*", "pastor*", "swin*", "cow*", "beef*", "cattl*", "trawler*", "fisher*", "pipelin*", "refiner*", "cobalt*", "mining*", "miner*", "quarri*", "quarry*", "parsnip*", "ingredient*", "food*", "nourish*", "spinach*", "porridg*", "carrot*", "salad*", "onion*", "rake*", "mustard*", "mineral*", "sprout*", "artichok*", "herring*", "pudding*", "clover*", "sieve*", "cucumber*", "lettuc*", "cook*", "vinegar*", "cinnamon*", "cranberr*", "dish*", "soda*", "lemom*", "pepper*", "menu*", "recipe*", "protein*"),
                        human_gardening=c("azaleas", "crocus", "roses", "climbers", "lavender", "narciss*", "gladiol*", "rhodedendron*", "tulip*", "hyacinth*", "daffodil*", "sturd*", "bud*", "bush*"),
                        human_domination=c("hectares", "landowner*", "triumph*", "faculty", "ambition", "stately", "pesticid*", "inoculat*", "medicin*", "antibiotic*", "tractor", "chemotherap*", "biotechnolog*", "genetic*", "chemic*", "drug*", "monsanto", "cosmetic*"),
                        human_treatment=c("forestry", "forester", "felling", "replanting", "planting", "hedgerow*", "planted", "treeplanting", "garden*", "pesticides", "field*", "terrace*", "nitrogen*", "landfill", "planted", "spruc*", "replant*", "tuileries", "timber*", "trunk*", "clearing"),
                        human_protection=c("conservation", "conservationist*", "environmentalist*", "ecologist*", "ecological*", "preservation", "preserv*", "sensitive", "guard", "valuabl*", "greenpeace", "ozone", "layer", "awareness", "recycl*", "pollut*", "radioact*", "seabed*", "energ*", "contain*", "methan*", "emission*", "reactor*", "self-suffic*", "warming*", "dioxid*", "spill*", "carbon*", "decarbon*", "gases", "greenhouse", "emission*", "concentrations", "bicarbonate", "deplet*", "emit*", "co2", "CO2", "coexist*"),
                        human_exploration=c("discover*", "indomitable", "reveal*", "discover*", "treasur*", "secret*", "tough*", "legend*", "adventur*", "climber*", "voyag*", "peak*", "mountaineer*", "hero*"),
                        human_inquiry=c("scientist*", "geologist*", "geographer*", "biologist*"),
                        religion=c("eden", "god", "biblical", "theolog*", "dedicat*", "christ*", "religi*", "sermon", "anglican", "creator", "spirit*", "jesus", "angels", "illumination", "divine", "supernatur*", "credo*", "creed*", "bibl*", "spirit*", "creation*", "apocalyptic*"),
                        awe_inspire=c("imagination", "graceful", "beautiful", "wonderful", "incredible", "tallest", "inspir*", "poetry", "philosoph*", "remarkabl*", "experience", "intellectual*", "delight*", "perfect*", "picturesque", "gleaming", "panoramic", "towering"),
                        primitive=c("bushcraft*", "civilisation", "civilized", "civilised", "uncivilised", "uncivilized", "instinct*", "wilderness*"),
                        rurality=c("countryside", "cottage", "farmhous*", "lodge", "village", "bonfire", "idyll*")
                        ))

# Score every document against the nature dictionary: one count column per
# dictionary category.
dfm.nature <- (dfm(lsx_dtm, dictionary = myNature))

# Category names in dfm column order; reused to label the correlation matrix.
feature_list <- dfm.nature@Dimnames$features

# Category-by-category correlation across documents.
# NOTE(review): ncol=35 is hard-coded to the number of dictionary categories;
# it must be kept in sync with myNature or the matrix silently misaligns.
cortab <- cor(matrix(dfm.nature,ncol=35))

colnames(cortab) <- feature_list
rownames(cortab) <- feature_list

# Parallel analysis and scree plot to choose the number of factors.
library(nFactors)
ev <- eigen(cortab) # get eigenvalues
ap <- parallel(subject=nrow(matrix(dfm.nature,ncol=35)),var=ncol(matrix(dfm.nature,ncol=35)),
  rep=100,cent=.05)
nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
plotnScree(nS)

# Varimax-rotated PCA with 10 factors; 1:350 = 35 categories x 10 factors,
# so this too depends on the hard-coded category count above.
pca_loadings <- matrix(principal(cortab,nfactors=10,rotate="varimax")$loadings[1:350],ncol=10)
colnames(pca_loadings) <- paste0("F",1:10)
rownames(pca_loadings) <- feature_list

# Emit the loading table (stargazer produces LaTeX by default).
stargazer(pca_loadings)

# Dictionary counts as a plain data frame, with document metadata attached.
df.nature <- data.frame(as.matrix(dfm.nature))
df.nature$year <- docvars(dfm.nature)$year
df.nature$Corpus <- docvars(dfm.nature)$Corpus
df.nature$wordcount <- rowSums(lsx_dtm)

# Overall nature score: sum of the core nature categories.
# NOTE(review): this uses the *raw* analytical column; analytical is
# redefined as a composite a few lines below, so assignment order matters.
df.nature$nature <- with(df.nature,climate+analytical+landscape+environment+lifeform_general+lifeform_plants+lifeform_land+lifeform_insects+lifeform_microbes+lifeform_birds+lifeform_water+reproduction+bioprocesses+threat_to_nature+threat_by_nature+human_extraction+human_gardening+human_treatment+human_protection+rurality)


# Composite: descriptive coverage (places, landscape, life forms, ...).
df.nature$descriptive <- df.nature$places+df.nature$landscape+df.nature$environment+df.nature$lifeform_birds+df.nature$lifeform_general+df.nature$lifeform_insects+df.nature$lifeform_land+df.nature$lifeform_microbes+df.nature$lifeform_plants+df.nature$lifeform_water

# Composite: analytical coverage. This OVERWRITES the raw analytical column
# in place (new value = raw analytical + the other four categories).
df.nature$analytical <- df.nature$analytical+df.nature$geospread+df.nature$reproduction+df.nature$evolution+df.nature$human_inquiry

# Composites: nature-as-power vs human-as-power.
df.nature$nature_power <- df.nature$threat_by_nature+df.nature$seasonality
df.nature$human_power <- df.nature$human_perspective+df.nature$human_extraction+df.nature$human_gardening+df.nature$human_domination+df.nature$human_treatment

# Composite: domination perspective (superset of the two power composites
# plus climate, human_protection and threat_to_nature).
df.nature$domination <- df.nature$climate+df.nature$human_protection+df.nature$threat_by_nature+df.nature$seasonality+df.nature$human_perspective+df.nature$human_extraction+df.nature$human_gardening+df.nature$human_domination+df.nature$human_treatment+df.nature$threat_to_nature

# Composite: inspiration perspective.
df.nature$inspiration <- df.nature$religion+df.nature$awe_inspire+df.nature$seasonality+df.nature$rurality+df.nature$human_exploration+df.nature$primitive

# Composite: protection perspective.
df.nature$protection <- df.nature$climate+df.nature$human_protection+df.nature$human_treatment+df.nature$threat_to_nature

# Salience measures: dictionary hits per 1000 words of article text.
df.nature$nature_salience <- 1000*df.nature$nature/df.nature$wordcount
df.nature$analytical_salience <- 1000*df.nature$analytical /df.nature$wordcount
df.nature$descriptive_salience <- 1000*df.nature$descriptive/df.nature$wordcount
# df.nature$domination_salience <- 1000*df.nature$domination/df.nature$wordcount
# BUG FIX: nature_power_salience and human_power_salience were both computed
# from the 'domination' composite (copy-paste error); they now use their own
# composites (nature_power, human_power) defined above.
df.nature$nature_power_salience <- 1000*df.nature$nature_power/df.nature$wordcount
df.nature$human_power_salience <- 1000*df.nature$human_power/df.nature$wordcount
df.nature$inspiration_salience <- 1000*df.nature$inspiration/df.nature$wordcount
df.nature$protection_salience <- 1000*df.nature$protection/df.nature$wordcount

# Name vectors for the category groupings, used to select columns when
# pivoting to long format and when subsetting plots below.
# NOTE(review): cat_nature, cat_domination and cat_protection each list
# "weather", which is NOT part of the corresponding df.nature composite sums
# above -- presumably drift between the two definitions; TODO confirm which
# is authoritative.
cat_nature <-       c("weather", "climate", "analytical", "landscape", "environment", "lifeform_general", "lifeform_plants", "lifeform_land", "lifeform_insects", "lifeform_microbes", "lifeform_birds", "lifeform_water", "reproduction", "bioprocesses", "threat_to_nature", "threat_by_nature", "human_extraction", "human_gardening", "human_treatment", "human_protection", "rurality")
cat_descriptive <-  c("places", "landscape", "environment", "lifeform_birds", "lifeform_general", "lifeform_insects", "lifeform_land", "lifeform_microbes", "lifeform_plants", "lifeform_water")
cat_analytic <-     c("analytical", "geospread", "reproduction", "evolution", "human_inquiry")
cat_domination <-   c("weather", "climate", "human_protection", "threat_by_nature", "seasonality", "human_perspective", "human_extraction", "human_gardening", "human_domination", "human_treatment") # Seasonality is here, but secondary
    cat_nature_power <- c("threat_by_nature", "seasonality")
    cat_human_power <-  c("human_perspective", "human_extraction", "human_gardening", "human_domination", "human_treatment")
    cat_protection <-   c("climate", "human_protection", "human_treatment", "threat_to_nature", "weather")
cat_inspiration <-  c("religion", "awe_inspire", "seasonality", "rurality", "human_exploration", "primitive")


# Numeric year and decade bins for aggregation.
df.nature$Year <- as.numeric(df.nature$year)
df.nature$Decade <- floor(df.nature$Year/10)*10

# Long format over the six composite salience measures, restricted to
# nature-heavy articles (more than 50 dictionary hits per 1000 words).
# FIX: spell out the full argument name `cols` instead of relying on partial
# argument matching (`col=`) -- same behavior, but robust to future
# pivot_longer() signature changes.
ldf.nature <- pivot_longer(subset(df.nature,nature_salience>50),cols=c("descriptive_salience", "analytical_salience", "nature_power_salience", "human_power_salience", "protection_salience", "inspiration_salience"))


# Long format over the raw nature categories, same article subset.
ldf.cat_nature <- pivot_longer(subset(df.nature,nature_salience>50),cols=all_of(cat_nature))
ldf.cat_nature$share <- ldf.cat_nature$value/ldf.cat_nature$wordcount

# Long format over all category groupings at once (several names occur in
# more than one grouping vector).
ldf.cat <- pivot_longer(subset(df.nature,nature_salience>50),cols=all_of(c(cat_nature,cat_descriptive,cat_analytic,cat_nature_power,cat_human_power,cat_protection,cat_inspiration)))
ldf.cat$share <- ldf.cat$value/ldf.cat$wordcount
ldf.cat$Year <- as.numeric(ldf.cat$year)

# REVIEW: the lines below were incomplete leftovers -- two dangling `$`
# extractions with no field name, and an assignment of an entire data frame
# into a single column. None of them could run as written and nothing in the
# visible code uses `ldf.cat$total`, so they are commented out rather than
# guessed at.
# df.nature_salience$
# dfd.corpus$
#
# ldf.cat$total <- ldf.cat


# Decade-by-decade composition of the raw nature categories, per corpus
# (100%-stacked bars). Note: summarise() on a multi-way grouping returns a
# still-grouped result; fine here since it is piped straight into ggplot.
subset(ldf.cat,name%in%cat_nature) %>%
    group_by(Decade,Corpus,name) %>%
        summarise(share=sum(share)) %>%
            ggplot(aes(y=share,x=Decade,group=name,fill=name))+geom_col(position="fill",color="white")+
            theme_soft()+scale_fill_viridis_d()+facet_grid(Corpus~.) -> gg_cat_nature

gg_cat_nature

# Same 100%-stacked layout for the descriptive categories (both corpora pooled).
subset(ldf.cat,name%in%cat_descriptive) %>%
    group_by(Decade,name) %>%
        summarise(share=sum(share)) %>%
            ggplot(aes(y=share,x=Decade,group=name,fill=name))+geom_col(position="fill",color="white")+
            theme_soft()+scale_fill_viridis_d()

# ... the analytical categories ...
subset(ldf.cat,name%in%cat_analytic) %>%
    group_by(Decade,name) %>%
        summarise(share=sum(share)) %>%
            ggplot(aes(y=share,x=Decade,group=name,fill=name))+geom_col(position="fill",color="white")+
            theme_soft()+scale_fill_viridis_d()

# ... the nature-power categories ...
subset(ldf.cat,name%in%cat_nature_power) %>%
    group_by(Decade,name) %>%
        summarise(share=sum(share)) %>%
            ggplot(aes(y=share,x=Decade,group=name,fill=name))+geom_col(position="fill",color="white")+
            theme_soft()+scale_fill_viridis_d()

# ... the human-power categories ...
subset(ldf.cat,name%in%cat_human_power) %>%
    group_by(Decade,name) %>%
        summarise(share=sum(share)) %>%
            ggplot(aes(y=share,x=Decade,group=name,fill=name))+geom_col(position="fill",color="white")+
            theme_soft()+scale_fill_viridis_d()

# ... and the protection categories.
subset(ldf.cat,name%in%cat_protection) %>%
    group_by(Decade,name) %>%
        summarise(share=sum(share)) %>%
            ggplot(aes(y=share,x=Decade,group=name,fill=name))+geom_col(position="fill",color="white")+
            theme_soft()+scale_fill_viridis_d()

# Domination categories per corpus: relative (fill) and absolute (stack).
subset(ldf.cat,name%in%cat_domination) %>%
    group_by(Decade,Corpus,name) %>%
        summarise(share=sum(share)) %>%
            ggplot(aes(y=share,x=Decade,group=name,fill=name))+geom_col(position="fill",color="white")+
            theme_soft()+scale_fill_viridis_d()+facet_grid(Corpus~.)
subset(ldf.cat,name%in%cat_domination) %>%
    group_by(Decade,Corpus,name) %>%
        summarise(share=sum(share)) %>%
            ggplot(aes(y=share,x=Decade,group=name,fill=name))+geom_col(position="stack",color="white")+
            theme_soft()+scale_fill_viridis_d()+facet_grid(Corpus~.)


# Share of articles that are nature-heavy (salience > 50), by decade and year.
df.nature %>%
    group_by(Decade,Corpus) %>%
        summarise(nature_article_share=mean(as.numeric(nature_salience>50),na.rm=TRUE)) -> df.nature_salience

df.nature %>%
    group_by(Year,Corpus) %>%
        summarise(nature_article_share=mean(as.numeric(nature_salience>50),na.rm=TRUE)) -> dfy.nature_salience

# Mean composite-salience values per decade / per year.
ldfd.nature <- ldf.nature %>%
    group_by(name,Decade,Corpus) %>%
        summarise(mean=mean(value))

ldfy.nature <- ldf.nature %>%
    group_by(name,Year,Corpus) %>%
        summarise(mean=mean(value))

# Ordered factor to fix the stacking order of the composites in the plots.
ldfd.nature$name_o <- factor(ldfd.nature$name,ordered=TRUE,levels=c("descriptive_salience", "analytical_salience", "inspiration_salience", "nature_power_salience", "human_power_salience", "protection_salience"))

# Composite salience per decade: absolute (stack) and relative (fill) bars.
ggplot(ldfd.nature,aes(y=mean,x=Decade,group=name,fill=name))+geom_col()+facet_grid(Corpus~.)+theme_soft()+scale_color_viridis_d()+scale_fill_viridis_d()


ggplot(ldfd.nature,aes(y=mean,x=Decade,group=name,fill=name))+geom_col(position="fill")+facet_grid(Corpus~.)+theme_soft()+scale_color_viridis_d()+scale_fill_viridis_d()

# Perspective composition (descriptive excluded) with overlaid point series.
# NOTE(review): this first version is immediately overwritten by the next
# assignment; it differs only in the point scalings (n/36273 vs
# n.full/916888). The divisors and the /0.093 and *10 factors are presumably
# corpus-size / axis-scaling normalizers -- TODO confirm their provenance.
ggd_nature_persp_comp <- ggplot()+geom_col(data=subset(ldfd.nature,name_o!="descriptive_salience"),aes(y=mean,x=Decade,group=name_o,fill=name_o),position="fill")+
    geom_point(data=df.nature_salience,aes(y=nature_article_share*10,x=Decade,shape=Corpus),size=3.5)+
    geom_point(data=df.nature_salience,aes(y=nature_article_share*10,x=Decade,shape=Corpus),size=2.0,color="white")+
    geom_point(data=dfd.corpus,aes(y=n/36273,x=Decade,shape=Corpus),size=3.5,color="#f16913")+
    geom_point(data=dfd.corpus,aes(y=n/36273,x=Decade,shape=Corpus),size=2.0,color="#8c2d04")+
    facet_grid(Corpus~.)+theme_soft()+scale_color_viridis_d()+scale_fill_viridis_d()

# Final version actually saved below (dfd.corpus is built elsewhere in the file).
ggd_nature_persp_comp <- ggplot()+geom_col(data=subset(ldfd.nature,name_o!="descriptive_salience"),aes(y=mean,x=Decade,group=name_o,fill=name_o),position="fill")+
    geom_point(data=df.nature_salience,aes(y=nature_article_share/0.093,x=Decade,shape=Corpus),size=3.5)+
    geom_point(data=df.nature_salience,aes(y=nature_article_share/0.093,x=Decade,shape=Corpus),size=2.0,color="white")+
    geom_point(data=dfd.corpus,aes(y=n.full/916888,x=Decade,shape=Corpus),size=3.5,color="#f16913")+
    geom_point(data=dfd.corpus,aes(y=n.full/916888,x=Decade,shape=Corpus),size=2.0,color="#8c2d04")+
    facet_grid(Corpus~.)+theme_soft()+scale_color_viridis_d()+scale_fill_viridis_d()

# Same, stacked absolute instead of 100% fill.
ggd_nature_persp_abs <- ggplot()+geom_col(data=subset(ldfd.nature,name_o!="descriptive_salience"),aes(y=mean,x=Decade,group=name_o,fill=name_o),position="stack")+
    geom_point(data=df.nature_salience,aes(y=100*nature_article_share/0.093,x=Decade,shape=Corpus),size=3.5)+
    geom_point(data=df.nature_salience,aes(y=100*nature_article_share/0.093,x=Decade,shape=Corpus),size=2.0,color="white")+
    geom_point(data=dfd.corpus,aes(y=100*n.full/916888,x=Decade,shape=Corpus),size=3.5,color="#f16913")+
    geom_point(data=dfd.corpus,aes(y=100*n.full/916888,x=Decade,shape=Corpus),size=2.0,color="#8c2d04")+
    facet_grid(Corpus~.)+theme_soft()+scale_color_viridis_d()+scale_fill_viridis_d()

ggsave(ggd_nature_persp_comp,file="nat-persp-fill-decade.svg",device="svg",unit="cm",width=16,height=8,dpi=1200,scale=1.5)
ggsave(ggd_nature_persp_abs ,file="nat-persp-stack-decade.svg",device="svg",unit="cm",width=16,height=8,dpi=1200,scale=1.5)


# Variants that keep the descriptive composite in the bars.
ggd_xnature_persp_comp <- ggplot()+geom_col(data=(ldfd.nature),aes(y=mean,x=Decade,group=name_o,fill=name_o),position="fill")+
    geom_point(data=df.nature_salience,aes(y=nature_article_share/0.093,x=Decade,shape=Corpus),size=3.5)+
    geom_point(data=df.nature_salience,aes(y=nature_article_share/0.093,x=Decade,shape=Corpus),size=2.0,color="white")+
    geom_point(data=dfd.corpus,aes(y=n.full/916888,x=Decade,shape=Corpus),size=3.5,color="#f16913")+
    geom_point(data=dfd.corpus,aes(y=n.full/916888,x=Decade,shape=Corpus),size=2.0,color="#8c2d04")+
    facet_grid(Corpus~.)+theme_soft()+scale_color_viridis_d()+scale_fill_viridis_d()

ggd_xnature_persp_abs <- ggplot()+geom_col(data=(ldfd.nature),aes(y=mean,x=Decade,group=name_o,fill=name_o),position="stack")+
    geom_point(data=df.nature_salience,aes(y=100*nature_article_share/0.093,x=Decade,shape=Corpus),size=3.5)+
    geom_point(data=df.nature_salience,aes(y=100*nature_article_share/0.093,x=Decade,shape=Corpus),size=2.0,color="white")+
    geom_point(data=dfd.corpus,aes(y=100*n.full/916888,x=Decade,shape=Corpus),size=3.5,color="#f16913")+
    geom_point(data=dfd.corpus,aes(y=100*n.full/916888,x=Decade,shape=Corpus),size=2.0,color="#8c2d04")+
    facet_grid(Corpus~.)+theme_soft()+scale_color_viridis_d()+scale_fill_viridis_d()

ggsave(ggd_xnature_persp_comp,file="nat-xpersp-fill-decade.svg",device="svg",unit="cm",width=16,height=8,dpi=1200,scale=1.5)
ggsave(ggd_xnature_persp_abs ,file="nat-xpersp-stack-decade.svg",device="svg",unit="cm",width=16,height=8,dpi=1200,scale=1.5)



# Yearly (instead of decade) version of the perspective composition plot.
ggplot()+geom_col(data=subset(ldfy.nature,name!="descriptive_salience"),aes(y=mean,x=Year,group=name,fill=name),color="white",position="fill")+theme_soft()+scale_color_viridis_d()+scale_fill_viridis_d()+geom_point(data=dfy.nature_salience,aes(y=nature_article_share*3,x=Year,shape=Corpus),size=3.5)+geom_point(data=dfy.nature_salience,aes(y=nature_article_share*3,x=Year,shape=Corpus),size=2.0,color="white")+facet_grid(Corpus~.)



# Mean nature salience over time: decade bins, then yearly (saved to SVG).
df.nature %>%
    group_by(Decade,Corpus) %>%
        summarise(nature_salience=mean(nature_salience,na.rm=TRUE)) %>%
            ggplot(aes(y=nature_salience,x=Decade,shape=Corpus,color=Corpus,fill=Corpus))+geom_point()+geom_smooth()+theme_soft()+scale_color_viridis_d()+scale_fill_viridis_d()+ylim(0,20)

df.nature %>%
    group_by(Year,Corpus) %>%
        summarise(nature_salience=mean(nature_salience,na.rm=TRUE)) %>%
            ggplot(aes(y=nature_salience,x=Year,shape=Corpus,color=Corpus,fill=Corpus))+geom_point()+geom_smooth()+theme_soft()+scale_color_viridis_d(begin=.15,end=.65)+scale_fill_viridis_d(begin=.00,end=.50)+ylim(0,20) -> ggy_nature_salience

ggsave(ggy_nature_salience ,file="nat-salience-smooth-year.svg",device="svg",unit="cm",width=16,height=8,dpi=1200,scale=1.5)


# Yearly share of nature-heavy articles (> 50 hits per 1000 words), saved.
df.nature %>%
    group_by(Year,Corpus) %>%
        summarise(nature_salience=mean(nature_salience>50,na.rm=TRUE)) %>%
            ggplot(aes(y=nature_salience,x=Year,shape=Corpus,color=Corpus,fill=Corpus))+geom_point()+geom_smooth()+theme_soft()+scale_color_viridis_d(begin=.15,end=.65)+scale_fill_viridis_d(begin=.00,end=.50)+ylim(0,0.25) -> ggy_nature_gt50pm

ggsave(ggy_nature_gt50pm ,file="nat-gt50-smooth-year.svg",device="svg",unit="cm",width=16,height=8,dpi=1200,scale=1.5)



# Quick base-graphics sanity checks of the decade means.
plot(tapply(df.nature$nature_salience,df.nature$Decade,mean,na.rm=TRUE))
plot(tapply(df.nature$nature_salience>50,df.nature$Decade,mean,na.rm=TRUE))

ggplot(ldf.nature,aes(y=value,x=Decade,group=name,fill=name))+geom_col()+theme_soft()+scale_color_viridis_d()

# Yearly totals per corpus for every dictionary category, plus the article
# count n. NOTE(review): the resulting column ORDER matters -- pivot_longer()
# below selects columns positionally (cols=3:40), so do not reorder or insert
# columns here without updating that call.
df.nature %>%
    group_by(year,Corpus) %>%
        summarise(  places=sum(places),
                    tourism=sum(tourism),
                    climate=sum(climate),
                    analytical=sum(analytical),
                    landscape=sum(landscape),
                    environment=sum(environment),
                    lifeform_general=sum(lifeform_general),
                    lifeform_plants=sum(lifeform_plants),
                    lifeform_mammal=sum(lifeform_mammal),
                    lifeform_insects=sum(lifeform_insects),
                    lifeform_microbes=sum(lifeform_microbes),
                    lifeform_birds=sum(lifeform_birds),
                    lifeform_water=sum(lifeform_water),
                    geospread=sum(geospread),
                    abundance=sum(abundance),
                    reproduction=sum(reproduction),
                    bioprocesses=sum(bioprocesses),
                    evolution=sum(evolution),
                    damage_illness=sum(damage_illness),
                    damage_wound=sum(damage_wound),
                    threat_to_nature=sum(threat_to_nature),
                    threat_by_nature=sum(threat_by_nature),
                    seasonality=sum(seasonality),
                    human_perspective=sum(human_perspective),
                    human_extraction=sum(human_extraction),
                    human_gardening=sum(human_gardening),
                    food=sum(food),
                    human_domination=sum(human_domination),
                    human_treatment=sum(human_treatment),
                    human_protection=sum(human_protection),
                    human_exploration=sum(human_exploration),
                    human_inquiry=sum(human_inquiry),
                    religion=sum(religion),
                    awe_inspire=sum(awe_inspire),
                    primitive=sum(primitive),
                    rurality=sum(rurality),
                    building=sum(building),
                    nature=sum(nature),
                    n=n()) -> dfy_nature
                    
# Yearly composites, mirroring the per-article composites defined earlier.
dfy_nature$descriptive <- dfy_nature$places+dfy_nature$landscape+dfy_nature$environment+dfy_nature$lifeform_birds+dfy_nature$lifeform_general+dfy_nature$lifeform_insects+dfy_nature$lifeform_mammal+dfy_nature$lifeform_microbes+dfy_nature$lifeform_plants+dfy_nature$lifeform_water

# Overwrites the raw analytical total with the composite, as above.
dfy_nature$analytical <- dfy_nature$analytical+dfy_nature$geospread+dfy_nature$reproduction+dfy_nature$evolution+dfy_nature$human_inquiry

# NOTE(review): this yearly domination composite differs from the per-article
# one (no climate / human_protection / threat_to_nature, but includes food)
# -- left unchanged; TODO confirm whether the divergence is intentional.
dfy_nature$domination <- dfy_nature$threat_by_nature+dfy_nature$seasonality+dfy_nature$human_perspective+dfy_nature$human_extraction+dfy_nature$human_gardening+dfy_nature$food+dfy_nature$human_domination+dfy_nature$human_treatment

dfy_nature$inspiration <- dfy_nature$religion+dfy_nature$awe_inspire+dfy_nature$seasonality+dfy_nature$rurality+dfy_nature$human_exploration+dfy_nature$primitive

# BUG FIX: 'climate' was summed twice and 'threat_to_nature' was missing;
# now aligned with the per-article definition of protection
# (climate + human_protection + human_treatment + threat_to_nature).
dfy_nature$protection <- dfy_nature$climate+dfy_nature$human_protection+dfy_nature$human_treatment+dfy_nature$threat_to_nature



# Per-article average and raw yearly count of nature hits.
dfy_nature$nature_share <- dfy_nature$nature/dfy_nature$n
dfy_nature$nature_count <- dfy_nature$nature
                    
# Long format over the yearly category totals.
# NOTE(review): cols=3:40 is a POSITIONAL selection -- columns 3..40 of
# dfy_nature are the category totals produced by the summarise() above
# (places .. nature), excluding year, Corpus and n. Any change to that
# summarise() column list breaks this silently.
ldfy_nature <- pivot_longer(dfy_nature,cols=3:40)
# share: hits per article that year; share2: hits relative to all nature hits.
ldfy_nature$share <- ldfy_nature$value/ldfy_nature$n
ldfy_nature$share2 <- ldfy_nature$value/ldfy_nature$nature_count

ldfy_nature$Year <- as.numeric(ldfy_nature$year)
# Years since the start of the corpus (1785).
ldfy_nature$dyear <- as.numeric(ldfy_nature$year)-1785





# Within-nature share of each category over time, faceted by category/corpus.
ggplot(subset(ldfy_nature,name!="nature"),aes(x=Year,y=share2))+geom_point()+geom_smooth(span=.25)+facet_grid(name~Corpus)+theme_soft()


                        
#                       surrounding=c(),
#                       live_off_nature=c(),
#                       knowledge_object=c(),
#                       knowledge_inspire=c(),
#                       challenge_explore=c(),
#                       threat_overcome=c(),
#                       threat_helpless=c(),
#                       threat_apocalypse=c(),
#                       extract_careless=c(),
#                       extract_sustainable=c(),
#                       protect=c(),

4.5 Named Entity Recognition

Code
# Start the spaCy backend used by spacy_parse() below.
spacy_initialize()

# Load the raw text corpora: noncrisis (object nc.r) and crisis (df.text).
# NOTE(review): absolute Windows paths -- only runs on the author's machine.
load("m:\\users\\stefange\\onedrive - ntnu\\2-data\\crisis-collab\\noncrisis+corpus\\nc_text.RData")
load("M:\\Users\\stefange\\Onedrive - NTNU\\2-Data\\crisis-collab\\df_text.RData")

# Crisis corpus, with a numeric year column.
cc.text <- df.text
cc.text$year_num <- as.numeric(cc.text$year)


# Noncrisis corpus: lower-cased text, numeric year.
nc.text <- nc.r
nc.text$text <- str_to_lower(nc.r$text)
nc.text$year_num <- as.numeric(nc.text$year)

# cc.text <- df.text

# Year range of the corpora.
# NOTE(review): ylist is defined but the loops below hard-code 1785:2020.
ylist <- 1785:2020

# Per-year accumulators.
# NOTE(review): these *_year lists are initialized but never filled in the
# visible code -- possibly leftovers; TODO confirm.
cc_GPE_year <- list()
nc_GPE_year <- list()
cc_PERSON_year <- list()
nc_PERSON_year <- list()
cc_ORG_year <- list()
nc_ORG_year <- list()
cc_NORP_year <- list()
nc_NORP_year <- list()

# Per-document entity collectors for the noncrisis corpus.
nc.tx.PERSON <- list()
nc.tx.ORG <- list()
nc.tx.GPE <- list()
nc.tx.NORP <- list()

# Named-entity extraction for the noncrisis corpus, one document at a time.
# For each document: parse with spaCy, extract entities, and store the
# PERSON / ORG / GPE / NORP subsets in four parallel lists indexed by
# document position. Documents with no entities get a single all-NA
# placeholder row so list indices stay aligned with document indices.
for (i in 1:dim(nc.text)[1]){
        txt <- nc.text[i,]
        # dependency = T requests a dependency parse, but only entities are
        # consumed below -- presumably unnecessary work; TODO confirm.
        tmp <- spacy_parse(txt$text, lemma = F, entity = T, dependency = T)
        ent <- entity_extract(tmp)
        # Attach document metadata to every extracted entity row.
        if (dim(ent)[1]>0){
            ent$year <- txt$year
            ent$id <- txt$id
            ent$length <- ntoken(txt$text)}
        # Placeholder carrying the same metadata, for entity-free documents.
        empty_row <- data.frame(doc_id=NA,sentence_id=NA,entity=NA,entity_type=NA,year=txt$year,id=txt$id,length=ntoken(txt$text))

        if (dim(ent)[1]>0){
            PERSON <-  subset(ent,entity_type=="PERSON")
            ORG <-  subset(ent,entity_type=="ORG")
            GPE <-  subset(ent,entity_type=="GPE")
            NORP <-  subset(ent,entity_type=="NORP")
            nc.tx.PERSON[[i]] <- PERSON
            nc.tx.ORG[[i]] <- ORG
            nc.tx.GPE[[i]] <- GPE
            nc.tx.NORP[[i]] <- NORP
            }
        if (dim(ent)[1]==0){
            nc.tx.PERSON[[i]] <- empty_row
            nc.tx.ORG[[i]] <- empty_row
            nc.tx.GPE[[i]] <- empty_row
            nc.tx.NORP[[i]] <- empty_row
            }
        # Progress indicator: print every 100th document.
        if(i %in% seq(0,120000,100)){print(i)}
        flush.console()
}


# Per-document entity collectors for the crisis corpus.
cc.tx.PERSON <- list()
cc.tx.ORG <- list()
cc.tx.GPE <- list()
cc.tx.NORP <- list()

# Named-entity extraction for the crisis corpus -- same pattern as the
# noncrisis loop above (all-NA placeholder keeps indices aligned).
for (i in 1:dim(cc.text)[1]){
        txt <- cc.text[i,]
        tmp <- spacy_parse(txt$text, lemma = F, entity = T, dependency = T)
        ent <- entity_extract(tmp)
        if (dim(ent)[1]>0){
            ent$year <- txt$year
            ent$id <- txt$id
            ent$length <- ntoken(txt$text)}
        empty_row <- data.frame(doc_id=NA,sentence_id=NA,entity=NA,entity_type=NA,year=txt$year,id=txt$id,length=ntoken(txt$text))

        if (dim(ent)[1]>0){
            PERSON <-  subset(ent,entity_type=="PERSON")
            ORG <-  subset(ent,entity_type=="ORG")
            GPE <-  subset(ent,entity_type=="GPE")
            NORP <-  subset(ent,entity_type=="NORP")
            cc.tx.PERSON[[i]] <- PERSON
            cc.tx.ORG[[i]] <- ORG
            cc.tx.GPE[[i]] <- GPE
            cc.tx.NORP[[i]] <- NORP
            }
        if (dim(ent)[1]==0){
            cc.tx.PERSON[[i]] <- empty_row
            cc.tx.ORG[[i]] <- empty_row
            cc.tx.GPE[[i]] <- empty_row
            cc.tx.NORP[[i]] <- empty_row
            }
        # Progress indicator: print every 100th document.
        if(i %in% seq(0,188000,100)){print(i)}
        flush.console()
}

# Persist the per-document NER results to disk (expensive to recompute).
save(cc.tx.PERSON,file="cc_person_ner.RData")
save(cc.tx.ORG,file="cc_org_ner.RData")
save(cc.tx.GPE,file="cc_gpe_ner.RData")
save(cc.tx.NORP,file="cc_norp_ner.RData")

save(nc.tx.PERSON,file="nc_person_ner.RData")
save(nc.tx.ORG,file="nc_org_ner.RData")
save(nc.tx.GPE,file="nc_gpe_ner.RData")
save(nc.tx.NORP,file="nc_norp_ner.RData")

# Stack the per-document lists into one data frame per entity type.
nc_tx_PERSON <- do.call("rbind",nc.tx.PERSON)
cc_tx_PERSON <- do.call("rbind",cc.tx.PERSON)
nc_tx_ORG <- do.call("rbind",nc.tx.ORG)
cc_tx_ORG <- do.call("rbind",cc.tx.ORG)
nc_tx_GPE <- do.call("rbind",nc.tx.GPE)
cc_tx_GPE <- do.call("rbind",cc.tx.GPE)
nc_tx_NORP <- do.call("rbind",nc.tx.NORP)
cc_tx_NORP <- do.call("rbind",cc.tx.NORP)

# Tag corpus provenance on every row.
nc_tx_PERSON$corpus <- "noncrisis"
nc_tx_ORG$corpus <- "noncrisis"
nc_tx_GPE$corpus <- "noncrisis"
nc_tx_NORP$corpus <- "noncrisis"
cc_tx_PERSON$corpus <- "crisis"
cc_tx_ORG$corpus <- "crisis"
cc_tx_GPE$corpus <- "crisis"
cc_tx_NORP$corpus <- "crisis"

# Combined (crisis + noncrisis) entity tables.
tx_PERSON <- rbind(cc_tx_PERSON,nc_tx_PERSON)
tx_ORG <- rbind(cc_tx_ORG,nc_tx_ORG)
tx_GPE <- rbind(cc_tx_GPE,nc_tx_GPE)
tx_NORP <- rbind(cc_tx_NORP,nc_tx_NORP)

# Person-by-document frequency matrix for the noncrisis corpus.
# NOTE(review): nc_PPW is not used in the visible code below -- TODO confirm.
nc_PPW <- table(nc_tx_PERSON$entity,nc_tx_PERSON$id)

# Ordered frequency table: tabulate x, keep entries occurring at least n
# times (counts strictly greater than n - 1), and return them sorted from
# most to least frequent. Returns an empty table when nothing qualifies.
otab <- function(x, n) {
    counts <- table(x)
    frequent <- counts[counts > (n - 1)]
    frequent[order(frequent, decreasing = TRUE)]
}

# Yearly tables of organizations mentioned at least twice (crisis corpus).
# NOTE(review): indexing the list by calendar year leaves slots 1..1784 NULL;
# it works, but a character key (as.character(i)) would avoid the padding.
cc_orgs <- list()
for (i in 1785:2020){
    cc_orgs[[i]] <- otab(subset(cc_tx_ORG,year==i)$entity,2)
    }


# Entities with at least 100 mentions across both corpora combined.
persons_o100 <- otab(tx_PERSON$entity,100)
orgs_o100 <- otab(tx_ORG$entity,100)
gpe_o100 <- otab(tx_GPE$entity,100)
norp_o100 <- otab(tx_NORP$entity,100)

# Yearly counts of distinct persons/organizations mentioned at least twice,
# separately for the crisis and noncrisis corpora, written into the
# `thetimes` year table (created earlier in the file; one row per year).
for (i in 1785:2020){
    yearly_cc_PERSON <- subset(tx_PERSON,year==i & corpus=="crisis")
    yearly_nc_PERSON <- subset(tx_PERSON,year==i & corpus=="noncrisis")
    yearly_cc_ORG <- subset(tx_ORG,year==i & corpus=="crisis")
    yearly_nc_ORG <- subset(tx_ORG,year==i & corpus=="noncrisis")
    otab_PERSON_cc <- otab(x=yearly_cc_PERSON$entity,n=2)
    otab_PERSON_nc <- otab(x=yearly_nc_PERSON$entity,n=2)
    otab_ORG_cc <- otab(x=yearly_cc_ORG$entity,n=2)
    otab_ORG_nc <- otab(x=yearly_nc_ORG$entity,n=2)
    # Number of distinct qualifying entities = length of the frequency table.
    actorcount_cc <- length(otab_PERSON_cc) 
    actorcount_nc <- length(otab_PERSON_nc)
    orgcount_cc <- length(otab_ORG_cc) 
    orgcount_nc <- length(otab_ORG_nc)
    thetimes[thetimes$year==i,"PERSON.count_cc"] <- actorcount_cc
    thetimes[thetimes$year==i,"PERSON.count_nc"] <- actorcount_nc
    thetimes[thetimes$year==i,"ORG.count_cc"] <- orgcount_cc
    thetimes[thetimes$year==i,"ORG.count_nc"] <- orgcount_nc
    # Progress indicator; this loop is slow.
    print(i)
    flush.console()
    }


# Yearly entity-concentration (Gini) measures for the crisis and noncrisis
# corpora, written into the `thetimes` year table as <prefix>.gini_cc and
# <prefix>.gini_nc.
#
# REVIEW: the original repeated the identical loop four times (PERSON, GPE,
# NORP, ORG); it is factored into one helper. Column names, values, and the
# order in which the columns are created are unchanged.
#
# @param times  data frame with one row per year (column `year`).
# @param tx     combined entity table with columns entity, year, corpus.
# @param prefix column-name prefix, e.g. "PERSON" -> "PERSON.gini_cc".
# @param years  calendar years to process.
# @param n_min  minimum yearly frequency for an entity to enter the Gini.
# @return `times` with the two gini columns filled in.
add_yearly_gini <- function(times, tx, prefix, years = 1785:2020, n_min = 5) {
    for (i in years) {
        otab_cc <- otab(x = subset(tx, year == i & corpus == "crisis")$entity, n = n_min)
        otab_nc <- otab(x = subset(tx, year == i & corpus == "noncrisis")$entity, n = n_min)
        times[times$year == i, paste0(prefix, ".gini_cc")] <- Gini(otab_cc)
        times[times$year == i, paste0(prefix, ".gini_nc")] <- Gini(otab_nc)
        print(i)  # progress indicator; these loops are slow
        flush.console()
    }
    times
}

# Same order as the original four loops: PERSON, GPE, NORP, ORG.
thetimes <- add_yearly_gini(thetimes, tx_PERSON, "PERSON")
thetimes <- add_yearly_gini(thetimes, tx_GPE, "GPE")
thetimes <- add_yearly_gini(thetimes, tx_NORP, "NORP")
thetimes <- add_yearly_gini(thetimes, tx_ORG, "ORG")

# Frequency plot of persons with 1000+ mentions overall.
plot(otab(tx_PERSON$entity,n=1000))


# Plot-friendly layout: three stacked copies of thetimes, one block per
# corpus, 236 rows each (years 1785-2020).
# NOTE(review): the *.gini_cnw columns read here are only created by the
# wave-corpus loop further below -- this section presumably runs after that
# loop despite appearing first; TODO confirm the intended execution order.
thetimes2 <- rbind(thetimes,thetimes,thetimes)
thetimes2$corpus <- rep(c("crisis", "noncrisis", "wave"),each=236)
thetimes2$PERSON_gini <- c(thetimes$PERSON.gini_cc,thetimes$PERSON.gini_nc,thetimes$PERSON.gini_cnw)
thetimes2$GPE_gini <- c(thetimes$GPE.gini_cc,thetimes$GPE.gini_nc,thetimes$GPE.gini_cnw)
thetimes2$ORG_gini <- c(thetimes$ORG.gini_cc,thetimes$ORG.gini_nc,thetimes$ORG.gini_cnw)
thetimes2$NORP_gini <- c(thetimes$NORP.gini_cc,thetimes$NORP.gini_nc,thetimes$NORP.gini_cnw)

# Create a new theme
# Custom ggplot2 theme used throughout this script: warm sand panel, white
# gridlines/axes, angled x labels, brown facet strips, semi-transparent
# legend. Built on theme_bw() via %+replace%.
# NOTE(review): element_* `size` is deprecated in favour of `linewidth` in
# ggplot2 >= 3.4 -- left as-is for compatibility with the installed version.
theme_soft <- function (base_size = 11, base_family = "Open Sans") {
    theme_bw() %+replace% 
    theme(
      panel.grid.major  = element_line(color = "#ffffff",linetype="solid",size=1.0),
      panel.grid.minor  = element_line(color = "#ffffff",linetype="dotted",size=1.0),
      panel.background = element_rect(fill = "#f6e0b5"),
      panel.border = element_rect(color = "#ffffff", fill = NA, size=1.0),
    plot.background= element_rect(fill="#fff4e6"),
      axis.line = element_line(color = "#ffffff",size=1.0),
      axis.ticks = element_line(color = "#ffffff"),
      axis.text = element_text(color = "#000000"),
    axis.text.x = element_text(angle=45,hjust=1,vjust=1),
    axis.title = element_text(face="bold"),
    strip.background = element_rect(color="#967259",fill="#be9b7b"),
    strip.text = element_text(color="white",face="bold"),
    legend.background = element_rect(fill=alpha("#f6e0b5",.5),colour = "#937342")
      )
}


# Entity-diversity (1 - Gini) over time, one panel per entity type, colored
# by corpus. FIX: corrected the typo "regigious" -> "religious" in the NORP
# axis label.
gg_PERSON <- ggplot(thetimes2,aes(y=1-PERSON_gini,x=year,color=corpus,fill=corpus,shape=corpus))+geom_point()+geom_smooth(span=.15,se=FALSE)+scale_color_viridis_d(option="turbo",begin=.1,end=.85)+scale_fill_viridis_d(option="turbo",begin=.1,end=.85)+theme_soft()+scale_x_continuous(breaks=seq(1800,2020,50),minor_breaks=seq(1780,2020,10))+ylab("Diversity: Persons [1-Gini]")+theme(legend.position=c(0.1,0.1))+ylim(0,1)

gg_ORG <- ggplot(thetimes2,aes(y=1-ORG_gini,x=year,color=corpus,fill=corpus,shape=corpus))+geom_point()+geom_smooth(span=.15,se=FALSE)+scale_color_viridis_d(option="turbo",begin=.1,end=.85)+scale_fill_viridis_d(option="turbo",begin=.1,end=.85)+theme_soft()+scale_x_continuous(breaks=seq(1800,2020,50),minor_breaks=seq(1780,2020,10))+ylab("Diversity: Organizations [1-Gini]")+theme(legend.position=c(0.1,0.1))+ylim(0,1)

gg_GPE <- ggplot(thetimes2,aes(y=1-GPE_gini,x=year,color=corpus,fill=corpus,shape=corpus))+geom_point()+geom_smooth(span=.15,se=FALSE)+scale_color_viridis_d(option="turbo",begin=.1,end=.85)+scale_fill_viridis_d(option="turbo",begin=.1,end=.85)+theme_soft()+scale_x_continuous(breaks=seq(1800,2020,50),minor_breaks=seq(1780,2020,10))+ylab("Diversity: Geopolitical entities [1-Gini]")+theme(legend.position=c(0.1,0.1))+ylim(0,1)

gg_NORP <- ggplot(thetimes2,aes(y=1-NORP_gini,x=year,color=corpus,fill=corpus,shape=corpus))+geom_point()+geom_smooth(span=.15,se=FALSE)+scale_color_viridis_d(option="turbo",begin=.1,end=.85)+scale_fill_viridis_d(option="turbo",begin=.1,end=.85)+theme_soft()+scale_x_continuous(breaks=seq(1800,2020,50),minor_breaks=seq(1780,2020,10))+ylab("Diversity: Nationalities, religious, political groups [1-Gini]")+theme(legend.position=c(0.1,0.1))+ylim(0,1)

# 2x2 panel of all four diversity series.
grid.arrange(gg_PERSON,gg_ORG,gg_GPE,gg_NORP,ncol=2)

# Ordered/unordered factor versions of corpus and a numeric year for models.
thetimes2$ocorpus <- factor(thetimes2$corpus,ordered=TRUE,levels=c("noncrisis", "crisis", "wave"))
thetimes2$fcorpus <- factor(thetimes2$corpus,ordered=FALSE,levels=c("noncrisis", "crisis", "wave"))
thetimes2$year_num <- as.numeric(thetimes2$year)

# Tukey post-hoc contrasts between corpora, controlling for year.
# NOTE(review): the aov() models use `ocorpus` while linfct=mcp(fcorpus=...)
# names `fcorpus`, which is not in those models -- this looks like a
# mismatch; TODO confirm which factor coding was intended.
summary(glht(aov(1-NORP_gini~ocorpus+year_num,data=thetimes2),linfct=mcp(fcorpus="Tukey")))
summary(glht(aov(1-GPE_gini~ocorpus+year_num,data=thetimes2),linfct=mcp(fcorpus="Tukey")))
summary(glht(aov(1-ORG_gini~ocorpus+year_num,data=thetimes2),linfct=mcp(fcorpus="Tukey")))
summary(glht(aov(1-PERSON_gini~ocorpus+year_num,data=thetimes2),linfct=mcp(fcorpus="Tukey")))

# Linear trends in diversity with a corpus-by-year interaction.
summary(lm(1-NORP_gini~fcorpus*year_num,data=thetimes2))
summary(lm(1-GPE_gini~fcorpus*year_num,data=thetimes2))
summary(lm(1-ORG_gini~fcorpus*year_num,data=thetimes2))
summary(lm(1-PERSON_gini~fcorpus*year_num,data=thetimes2))


cnw_ids <- (unique(subset(TOTAL.df,prob>.05)$id))

cnw_tx <- subset(cc.text,id%in%cnw_ids)

cnw.text <- cnw_tx

cnw.tx.PERSON <- list()
cnw.tx.ORG <- list()
cnw.tx.GPE <- list()
cnw.tx.NORP <- list()

# Parse every crisis-wave document with spaCy, extract named entities, and
# collect the per-document PERSON/ORG/GPE/NORP rows (or a single placeholder
# row when a document yields no entities), indexed by document position.
for (i in 1:dim(cnw.text)[1]){
        txt <- cnw.text[i,]
        # Full parse with entities and dependencies (lemmas not needed here).
        tmp <- spacy_parse(txt$text, lemma = F, entity = T, dependency = T)
        ent <- entity_extract(tmp)
        # Attach year, id, and document length (tokens) to each entity row.
        if (dim(ent)[1]>0){
            ent$year <- txt$year
            ent$id <- txt$id
            ent$length <- ntoken(txt$text)}
        # Placeholder row used for documents with no extracted entities, so
        # the year/id/length metadata is still recorded.
        empty_row <- data.frame(doc_id=NA,sentence_id=NA,entity=NA,entity_type=NA,year=txt$year,id=txt$id,length=ntoken(txt$text))

        if (dim(ent)[1]>0){
            PERSON <-  subset(ent,entity_type=="PERSON")
            ORG <-  subset(ent,entity_type=="ORG")
            GPE <-  subset(ent,entity_type=="GPE")
            NORP <-  subset(ent,entity_type=="NORP")
            cnw.tx.PERSON[[i]] <- PERSON
            cnw.tx.ORG[[i]] <- ORG
            cnw.tx.GPE[[i]] <- GPE
            cnw.tx.NORP[[i]] <- NORP
            }
        if (dim(ent)[1]==0){
            cnw.tx.PERSON[[i]] <- empty_row
            cnw.tx.ORG[[i]] <- empty_row
            cnw.tx.GPE[[i]] <- empty_row
            cnw.tx.NORP[[i]] <- empty_row
            }
        # Progress marker every 100 documents.
        if(i %in% seq(0,188000,100)){print(i)}
        flush.console()
}

# Stack the per-document entity rows into one data frame per entity type
# and tag every row as belonging to the "wave" corpus.
for (tp in c("PERSON", "ORG", "GPE", "NORP")) {
  stacked <- do.call(rbind, get(paste0("cnw.tx.", tp)))
  stacked$corpus <- "wave"
  assign(paste0("cnw_tx_", tp), stacked)
}

# Yearly entity concentration for the crisis-wave corpus: Gini over the
# otab() frequency table, written into the matching thetimes columns
# (ORG/PERSON/GPE/NORP, in that order, as in the original).
for (yr in 1785:2020) {
    for (tp in c("ORG", "PERSON", "GPE", "NORP")) {
        yearly <- subset(get(paste0("cnw_tx_", tp)), year == yr & corpus == "wave")
        thetimes[thetimes$year == yr, paste0(tp, ".gini_cnw")] <- Gini(otab(x = yearly$entity, n = 5))
    }
    print(yr)
    flush.console()
    }

# Exploratory inspection of ordered entity tables (console output only;
# nothing below is used downstream).
plot(otab(tx_PERSON$entity,n=1000))




otab(x=tx_PERSON$entity,n=1000)
otab(x=subset(nc_tx_PERSON,year==1785)$entity,n=5)
otab(x=subset(nc_tx_PERSON,year==1786)$entity,n=5)
# NOTE(review): the next three calls duplicate the 1785 query above
# verbatim -- possibly leftovers from interactive exploration.
otab(x=subset(nc_tx_PERSON,year==1785)$entity,n=5)
otab(x=subset(nc_tx_PERSON,year==1785)$entity,n=5)
otab(x=subset(nc_tx_PERSON,year==1785)$entity,n=5)
otab(subset(nc_tx_GPE,year==1785)$entity)
otab(subset(nc_tx_GPE,year==2020)$entity)

otab(subset(nc_tx_ORG,year==2000)$entity)

otab(nc_tx_ORG$entity)[1:100]



# For every year in ylist: parse that year's crisis-corpus (cc) and
# non-crisis (nc) texts with spaCy, extract named entities, and record the
# per-year diversity (1 - Gini of entity counts) and mention counts in
# `yeardat`, plus the year's entity frequency tables in the per-year lists.
# Fix: the original stored e.g. `cc.GPE[order(cc.GPE$,decreasing=TRUE)]`,
# which is a parse error (`$` with no column name). Following the ordered
# frequency-table pattern used elsewhere in this script, the sorted entity
# counts are stored instead -- TODO confirm this matches the intent.
for (i in seq_along(ylist)) {

    if (sum(cc.text$year_num==ylist[[i]],na.rm=TRUE)>0) {
        cc.tx <- subset(cc.text,year_num==ylist[[i]])$text
        cc.a <- spacy_parse(cc.tx, lemma = F, entity = T, dependency = T)
        cc.x <- entity_extract(cc.a)
        cc.GPE <- subset(cc.x,entity_type=="GPE")
        cc.NORP <- subset(cc.x,entity_type=="NORP")
        cc.ORG <- subset(cc.x,entity_type=="ORG")
        cc.PERSON <- subset(cc.x,entity_type=="PERSON")
        cc.GPE.gini <- 1-Gini(table(cc.GPE$entity))
        cc.NORP.gini <- 1-Gini(table(cc.NORP$entity))
        cc.ORG.gini <- 1-Gini(table(cc.ORG$entity))
        cc.PERSON.gini <- 1-Gini(table(cc.PERSON$entity))
        cc.GPE.count <- dim(cc.GPE)[1]
        cc.NORP.count <- dim(cc.NORP)[1]
        cc.ORG.count <- dim(cc.ORG)[1]
        cc.PERSON.count <- dim(cc.PERSON)[1]
        yeardat[yeardat$year==ylist[[i]],"cc.GPE.gini"] <- cc.GPE.gini
        yeardat[yeardat$year==ylist[[i]],"cc.NORP.gini"] <- cc.NORP.gini
        yeardat[yeardat$year==ylist[[i]],"cc.ORG.gini"] <- cc.ORG.gini
        yeardat[yeardat$year==ylist[[i]],"cc.PERSON.gini"] <- cc.PERSON.gini
        yeardat[yeardat$year==ylist[[i]],"cc.GPE.count"] <- cc.GPE.count
        yeardat[yeardat$year==ylist[[i]],"cc.NORP.count"] <- cc.NORP.count
        yeardat[yeardat$year==ylist[[i]],"cc.ORG.count"] <- cc.ORG.count
        yeardat[yeardat$year==ylist[[i]],"cc.PERSON.count"] <- cc.PERSON.count
        # Sorted entity frequency tables per year (see fix note above).
        cc.GPE.tab <- table(cc.GPE$entity)
        cc.PERSON.tab <- table(cc.PERSON$entity)
        cc.ORG.tab <- table(cc.ORG$entity)
        cc.NORP.tab <- table(cc.NORP$entity)
        cc_GPE_year[[ylist[[i]]]] <- cc.GPE.tab[order(cc.GPE.tab,decreasing=TRUE)]
        cc_PERSON_year[[ylist[[i]]]] <- cc.PERSON.tab[order(cc.PERSON.tab,decreasing=TRUE)]
        cc_ORG_year[[ylist[[i]]]] <- cc.ORG.tab[order(cc.ORG.tab,decreasing=TRUE)]
        cc_NORP_year[[ylist[[i]]]] <- cc.NORP.tab[order(cc.NORP.tab,decreasing=TRUE)]
        }
    else {
        # No crisis-corpus texts for this year: record explicit NAs.
        yeardat[yeardat$year==ylist[[i]],"cc.GPE.gini"] <- NA
        yeardat[yeardat$year==ylist[[i]],"cc.NORP.gini"] <- NA
        yeardat[yeardat$year==ylist[[i]],"cc.ORG.gini"] <- NA
        yeardat[yeardat$year==ylist[[i]],"cc.PERSON.gini"] <- NA
        yeardat[yeardat$year==ylist[[i]],"cc.GPE.count"] <- NA
        yeardat[yeardat$year==ylist[[i]],"cc.NORP.count"] <- NA
        yeardat[yeardat$year==ylist[[i]],"cc.ORG.count"] <- NA
        yeardat[yeardat$year==ylist[[i]],"cc.PERSON.count"] <- NA
        cc_GPE_year[[ylist[[i]]]] <- NA
        cc_PERSON_year[[ylist[[i]]]] <- NA
        cc_ORG_year[[ylist[[i]]]] <- NA
        cc_NORP_year[[ylist[[i]]]] <- NA
        }

    if (sum(nc.text$year_num==ylist[[i]],na.rm=TRUE)>0) {
        nc.tx <- subset(nc.text,year_num==ylist[[i]])$text
        nc.a <- spacy_parse(nc.tx, lemma = F, entity = T, dependency = T)
        nc.x <- entity_extract(nc.a)
        nc.GPE <- subset(nc.x,entity_type=="GPE")
        nc.NORP <- subset(nc.x,entity_type=="NORP")
        nc.ORG <- subset(nc.x,entity_type=="ORG")
        nc.PERSON <- subset(nc.x,entity_type=="PERSON")
        nc.GPE.gini <- 1-Gini(table(nc.GPE$entity))
        nc.NORP.gini <- 1-Gini(table(nc.NORP$entity))
        nc.ORG.gini <- 1-Gini(table(nc.ORG$entity))
        nc.PERSON.gini <- 1-Gini(table(nc.PERSON$entity))
        nc.GPE.count <- dim(nc.GPE)[1]
        nc.NORP.count <- dim(nc.NORP)[1]
        nc.ORG.count <- dim(nc.ORG)[1]
        nc.PERSON.count <- dim(nc.PERSON)[1]
        yeardat[yeardat$year==ylist[[i]],"nc.GPE.gini"] <- nc.GPE.gini
        yeardat[yeardat$year==ylist[[i]],"nc.NORP.gini"] <- nc.NORP.gini
        yeardat[yeardat$year==ylist[[i]],"nc.ORG.gini"] <- nc.ORG.gini
        yeardat[yeardat$year==ylist[[i]],"nc.PERSON.gini"] <- nc.PERSON.gini
        yeardat[yeardat$year==ylist[[i]],"nc.GPE.count"] <- nc.GPE.count
        yeardat[yeardat$year==ylist[[i]],"nc.NORP.count"] <- nc.NORP.count
        yeardat[yeardat$year==ylist[[i]],"nc.ORG.count"] <- nc.ORG.count
        yeardat[yeardat$year==ylist[[i]],"nc.PERSON.count"] <- nc.PERSON.count
        # Same parse-error fix as in the cc branch above.
        nc.GPE.tab <- table(nc.GPE$entity)
        nc.PERSON.tab <- table(nc.PERSON$entity)
        nc.ORG.tab <- table(nc.ORG$entity)
        nc.NORP.tab <- table(nc.NORP$entity)
        nc_GPE_year[[ylist[[i]]]] <- nc.GPE.tab[order(nc.GPE.tab,decreasing=TRUE)]
        nc_PERSON_year[[ylist[[i]]]] <- nc.PERSON.tab[order(nc.PERSON.tab,decreasing=TRUE)]
        nc_ORG_year[[ylist[[i]]]] <- nc.ORG.tab[order(nc.ORG.tab,decreasing=TRUE)]
        nc_NORP_year[[ylist[[i]]]] <- nc.NORP.tab[order(nc.NORP.tab,decreasing=TRUE)]
        }
    else {
        # No non-crisis texts for this year: record explicit NAs.
        yeardat[yeardat$year==ylist[[i]],"nc.GPE.gini"] <- NA
        yeardat[yeardat$year==ylist[[i]],"nc.NORP.gini"] <- NA
        yeardat[yeardat$year==ylist[[i]],"nc.ORG.gini"] <- NA
        yeardat[yeardat$year==ylist[[i]],"nc.PERSON.gini"] <- NA
        yeardat[yeardat$year==ylist[[i]],"nc.GPE.count"] <- NA
        yeardat[yeardat$year==ylist[[i]],"nc.NORP.count"] <- NA
        yeardat[yeardat$year==ylist[[i]],"nc.ORG.count"] <- NA
        yeardat[yeardat$year==ylist[[i]],"nc.PERSON.count"] <- NA
        nc_GPE_year[[ylist[[i]]]] <- NA
        nc_PERSON_year[[ylist[[i]]]] <- NA
        nc_ORG_year[[ylist[[i]]]] <- NA
        nc_NORP_year[[ylist[[i]]]] <- NA
        }
    print(i)
    flush.console()
}







# Crisis-corpus era subsets (uneven windows by design: wider early on,
# narrower in recent decades).
c1700s <- subset(cc.text, year_num > 1699 & year_num < 1800)
c1800s <- subset(cc.text, year_num > 1799 & year_num < 1900)
c1900s <- subset(cc.text, year_num > 1899 & year_num < 1925)
c1925s <- subset(cc.text, year_num > 1924 & year_num < 1950)
c1950s <- subset(cc.text, year_num > 1949 & year_num < 1970)
c1970s <- subset(cc.text, year_num > 1969 & year_num < 1980)
c1980s <- subset(cc.text, year_num > 1979 & year_num < 1990)
c1990s <- subset(cc.text, year_num > 1989 & year_num < 2000)
c2000s <- subset(cc.text, year_num > 1999 & year_num < 2010)
c2010s <- subset(cc.text, year_num > 2009 & year_num < 2015)
c2015s <- subset(cc.text, year_num > 2014 & year_num < 2018)
c2018s <- subset(cc.text, year_num > 2017 & year_num < 2050)

# Full-corpus (df.text) era subsets; note the finer 25-year windows across
# the nineteenth century.
t1700s <- subset(df.text, year_num > 1699 & year_num < 1800)
t1800s <- subset(df.text, year_num > 1799 & year_num < 1825)
t1825s <- subset(df.text, year_num > 1824 & year_num < 1850)
t1850s <- subset(df.text, year_num > 1849 & year_num < 1875)
t1875s <- subset(df.text, year_num > 1874 & year_num < 1900)
t1900s <- subset(df.text, year_num > 1899 & year_num < 1925)
t1925s <- subset(df.text, year_num > 1924 & year_num < 1950)
t1950s <- subset(df.text, year_num > 1949 & year_num < 1970)
t1970s <- subset(df.text, year_num > 1969 & year_num < 1980)
t1980s <- subset(df.text, year_num > 1979 & year_num < 1990)
t1990s <- subset(df.text, year_num > 1989 & year_num < 2000)
t2000s <- subset(df.text, year_num > 1999 & year_num < 2010)
t2010s <- subset(df.text, year_num > 2009 & year_num < 2015)
t2015s <- subset(df.text, year_num > 2014 & year_num < 2018)
t2018s <- subset(df.text, year_num > 2017 & year_num < 2050)

# Raw text vectors per era (identical subsets to the t* frames above).
texts1700s <- t1700s$text
texts1800s <- t1800s$text
texts1825s <- t1825s$text
texts1850s <- t1850s$text
texts1875s <- t1875s$text
texts1900s <- t1900s$text
texts1925s <- t1925s$text
texts1950s <- t1950s$text
texts1970s <- t1970s$text
texts1980s <- t1980s$text
texts1990s <- t1990s$text
texts2000s <- t2000s$text
texts2010s <- t2010s$text
# texts2015s <- t2015s$text
# texts2018s <- t2018s$text

# spaCy parse of each era's non-crisis texts (entities + dependencies;
# ordered chronologically here for readability).
actors1700_nc <- spacy_parse(texts1700s, lemma = F, entity = T, dependency = T)
actors1800_nc <- spacy_parse(texts1800s, lemma = F, entity = T, dependency = T)
actors1825_nc <- spacy_parse(texts1825s, lemma = F, entity = T, dependency = T)
actors1850_nc <- spacy_parse(texts1850s, lemma = F, entity = T, dependency = T)
actors1875_nc <- spacy_parse(texts1875s, lemma = F, entity = T, dependency = T)
actors1900_nc <- spacy_parse(texts1900s, lemma = F, entity = T, dependency = T)
actors1925_nc <- spacy_parse(texts1925s, lemma = F, entity = T, dependency = T)
actors1950_nc <- spacy_parse(texts1950s, lemma = F, entity = T, dependency = T)
actors1970_nc <- spacy_parse(texts1970s, lemma = F, entity = T, dependency = T)
actors1980_nc <- spacy_parse(texts1980s, lemma = F, entity = T, dependency = T)
actors1990_nc <- spacy_parse(texts1990s, lemma = F, entity = T, dependency = T)
actors2000_nc <- spacy_parse(texts2000s, lemma = F, entity = T, dependency = T)
actors2010_nc <- spacy_parse(texts2010s, lemma = F, entity = T, dependency = T)
# actors2015_nc <- spacy_parse(texts2015s, lemma = F, entity = T, dependency = T)
# actors2018_nc <- spacy_parse(texts2018s, lemma = F, entity = T, dependency = T)

# Cache each parse to disk (these runs are expensive).
# Fix: the original saved every era except 1850, although actors1850_nc is
# computed above and x1850_nc is derived from it further below.
save(actors1700_nc,file="a1700_nc.RData")
save(actors1800_nc,file="a1800_nc.RData")
save(actors1825_nc,file="a1825_nc.RData")
save(actors1850_nc,file="a1850_nc.RData")
save(actors1875_nc,file="a1875_nc.RData")
save(actors1900_nc,file="a1900_nc.RData")
save(actors1925_nc,file="a1925_nc.RData")
save(actors1950_nc,file="a1950_nc.RData")
save(actors1970_nc,file="a1970_nc.RData")
save(actors1980_nc,file="a1980_nc.RData")
save(actors1990_nc,file="a1990_nc.RData")
save(actors2000_nc,file="a2000_nc.RData")
save(actors2010_nc,file="a2010_nc.RData")

# Restore the cached parses.
# Fix: a1800_nc.RData is saved above but was missing from this load block,
# although entity_extract(actors1800_nc) below requires it.
load("a1700_nc.RData")
load("a1800_nc.RData")
load("a1825_nc.RData")
# NOTE(review): the original never saved or loaded a1850_nc.RData; the later
# entity_extract(actors1850_nc) call relies on the in-memory object from the
# parse step above -- confirm and add a matching load if sessions differ.
load("a1875_nc.RData")
load("a1900_nc.RData")
load("a1925_nc.RData")
load("a1950_nc.RData")
load("a1970_nc.RData")
load("a1980_nc.RData")
load("a1990_nc.RData")
load("a2000_nc.RData")
load("a2010_nc.RData")

# Entity tables (one row per entity mention) from each era's parse.
for (era in c("1700", "1800", "1825", "1850", "1875", "1900", "1925",
              "1950", "1970", "1980", "1990", "2000", "2010")) {
  assign(paste0("x", era, "_nc"),
         entity_extract(get(paste0("actors", era, "_nc"))))
}

# For every era: the entity frequency table (tab*), the same table sorted by
# descending frequency (otab*), and the names of its 1000 most frequent
# entities (items*). Collapses the original twelve copy-pasted blocks.
for (era in c("2010", "2000", "1990", "1980", "1970", "1950",
              "1925", "1900", "1875", "1850", "1825", "1700")) {
  freq <- table(get(paste0("x", era, "_nc"))$entity)
  ofreq <- freq[order(freq, decreasing = TRUE)]
  assign(paste0("tab", era), freq)
  assign(paste0("otab", era), ofreq)
  assign(paste0("items", era), names(ofreq[1:1000]))
}

# Return up to 100 KWIC concordance lines for `keyword` within `textset`.
#
# `keyword` comes from entity_extract(), which joins multi-word entities
# with "_"; underscores are converted back to spaces before matching.
# Fixes relative to the original:
#   * the underscore-free form `keyw` is now actually used for matching --
#     the original searched for the raw `keyword`, so multi-word entities
#     never matched;
#   * examples are sampled (without replacement) from the MATCHING texts --
#     the original indexed the full `textset` with runif()-generated,
#     fractional, possibly duplicated positions;
#   * phrase() lets kwic() match multi-word patterns.
find.text.examples <- function(textset,keyword){
    keyw <- str_replace_all(keyword,"_", " ")
    all_relevant_texts_pointer <- str_detect(textset,fixed(keyw))
    textset_relevant <- textset[all_relevant_texts_pointer]
    # Up to 100 random matching texts; 0 matches yields an empty selection.
    n_sel <- min(100,length(textset_relevant))
    textset_selected <- textset_relevant[sample.int(length(textset_relevant),n_sel)]
    output <- tokens(textset_selected)
    kwic_output <- kwic(output,phrase(keyw),window=10)
    return(kwic_output)
    }


# Tokenize each era's texts and initialise the per-era concordance
# accumulator lists.
for (era in c("1700", "1825", "1850", "1875", "1900", "1925",
              "1950", "1970", "1980", "1990", "2000", "2010")) {
  assign(paste0("tok", era), tokens(get(paste0("texts", era, "s"))))
  assign(paste0("phrases", era), list())
}

# NOTE(review): `subset=="London"` compares an (apparently undefined)
# `subset` object instead of supplying a docvar condition; this exploratory
# line likely errors -- confirm the intended tokens_subset() expression.
tokens_subset(tok1700,subset=="London")

# Lower-case both the era texts (t*) and the era top-entity lists (i*) so
# that keyword matching in find.text.examples() is case-insensitive.
for (era in c("1700", "1825", "1850", "1875", "1900", "1925",
              "1950", "1970", "1980", "1990", "2000", "2010")) {
  assign(paste0("t", era), str_to_lower(get(paste0("texts", era, "s"))))
  assign(paste0("i", era), str_to_lower(get(paste0("items", era))))
}


# For each of the 1000 top entities per era, collect up to 100 KWIC
# concordance examples from that era's lower-cased texts. The numbered
# print() calls trace which era finished within the current iteration.
for (i in 1:1000) {
        phrases1700[[i]] <- find.text.examples(textset=t1700 ,keyword=i1700[i])
            print("1")
        phrases1825[[i]] <- find.text.examples(textset=t1825 ,keyword=i1825[i])
            print("2")
        phrases1850[[i]] <- find.text.examples(textset=t1850 ,keyword=i1850[i])
            print("3")
        phrases1875[[i]] <- find.text.examples(textset=t1875 ,keyword=i1875[i])
            print("4")
        phrases1900[[i]] <- find.text.examples(textset=t1900 ,keyword=i1900[i])
            print("5")
        phrases1925[[i]] <- find.text.examples(textset=t1925 ,keyword=i1925[i])
            print("6")
        phrases1950[[i]] <- find.text.examples(textset=t1950 ,keyword=i1950[i])
            print("7")
        phrases1970[[i]] <- find.text.examples(textset=t1970 ,keyword=i1970[i])
            print("8")
        phrases1980[[i]] <- find.text.examples(textset=t1980 ,keyword=i1980[i])
            print("9")
        phrases1990[[i]] <- find.text.examples(textset=t1990 ,keyword=i1990[i])
            print("10")
        phrases2000[[i]] <- find.text.examples(textset=t2000 ,keyword=i2000[i])
            print("11")
        phrases2010[[i]] <- find.text.examples(textset=t2010 ,keyword=i2010[i])
            print("12")
    flush.console()
    print(i)
}

# Cache the per-era concordance lists; save(list=) stores each object under
# its own name, exactly like the original save() calls.
for (era in c("1700", "1825", "1850", "1875", "1900", "1925",
              "1950", "1970", "1980", "1990", "2000", "2010")) {
  save(list = paste0("phrases", era), file = paste0("phr", era, ".RData"))
}


# Collapse a kwic() result into one display string: one line per hit in the
# form "pre ||| KEYWORD ||| post".
put_together_phrases <- function(x){
    per_hit <- sprintf("%s ||| %s ||| %s \n", x$pre, str_to_upper(x$keyword), x$post)
    glue_collapse(per_hit)
    }

# Flatten each era's list of kwic results into a character vector of
# concatenated concordance strings (ph*).
for (era in c("1700", "1825", "1850", "1875", "1900", "1925",
              "1950", "1970", "1980", "1990", "2000", "2010")) {
  assign(paste0("ph", era),
         unlist(lapply(get(paste0("phrases", era)), FUN = put_together_phrases)))
}

# Combine the per-era top-1000 entity names and their concordance strings
# into one long table (1000 rows per era) and export it for manual coding.
all_items <- c(
  items1700, items1825, items1850, items1875, items1900, items1925,
  items1950, items1970, items1980, items1990, items2000, items2010
)

all_phrases <- c(
  ph1700, ph1825, ph1850, ph1875, ph1900, ph1925,
  ph1950, ph1970, ph1980, ph1990, ph2000, ph2010
)

df.entities <- data.frame(
  entity = all_items,
  phrases = all_phrases,
  era = rep(c(1700, 1825, 1850, 1875, 1900, 1925, 1950, 1970, 1980, 1990, 2000, 2010), each = 1000)
)

# Semicolon-separated CSV with decimal commas (write.csv2 convention).
write.csv2(df.entities, file = "df_entities.csv")




# Per-era PERSON entity subsets (per*) and their top-1000 ordered frequency
# tables (p*).
# Fix: the original contained this nine-era block twice, verbatim; the
# second copy recomputed identical objects and is dropped here.
for (era in c("2010", "2000", "1990", "1980", "1970",
              "1950", "1875", "1825", "1700")) {
  ents <- subset(get(paste0("x", era, "_nc")), entity_type == "PERSON")
  freq <- table(ents$entity)
  assign(paste0("per", era, "_nc"), ents)
  assign(paste0("p", era, "_nc"), freq[order(freq, decreasing = TRUE)][1:1000])
}

# GPE, ORG and NORP analogues of the PERSON tables above: per-era entity
# subsets (GPE*/ORG*/NORP*) and top-1000 ordered frequency tables with the
# short prefixes g/o/n used by the actor.gini computation below.
for (tp in c("GPE", "ORG", "NORP")) {
  short <- c(GPE = "g", ORG = "o", NORP = "n")[[tp]]
  for (era in c("2010", "2000", "1990", "1980", "1970",
                "1950", "1875", "1825", "1700")) {
    ents <- subset(get(paste0("x", era, "_nc")), entity_type == tp)
    freq <- table(ents$entity)
    assign(paste0(tp, era, "_nc"), ents)
    assign(paste0(short, era, "_nc"), freq[order(freq, decreasing = TRUE)][1:1000])
  }
}

# Era-level Gini coefficients of the top-1000 entity frequency tables, per
# entity type, reshaped to long format for plotting.
# NOTE(review): this uses lower-case gini() while the rest of the script
# calls Gini() -- confirm which package supplies gini() and that both
# implementations agree.
actor.gini <- data.frame(   period=c("2010-2014", "2000-2009", "1990-1999", "1980-1989", "1970-1979", "1950-1969", "1875-1949", "1825-1874", "1785-1824"),
                    gpe=c(gini(g2010_nc),gini(g2000_nc),gini(g1990_nc),gini(g1980_nc),gini(g1970_nc),gini(g1950_nc),gini(g1875_nc),gini(g1825_nc),gini(g1700_nc)),
                    person=c(gini(p2010_nc),gini(p2000_nc),gini(p1990_nc),gini(p1980_nc),gini(p1970_nc),gini(p1950_nc),gini(p1875_nc),gini(p1825_nc),gini(p1700_nc)),
                    org=c(gini(o2010_nc),gini(o2000_nc),gini(o1990_nc),gini(o1980_nc),gini(o1970_nc),gini(o1950_nc),gini(o1875_nc),gini(o1825_nc),gini(o1700_nc)),
                    norp=c(gini(n2010_nc),gini(n2000_nc),gini(n1990_nc),gini(n1980_nc),gini(n1950_nc),gini(n1950_nc),gini(n1875_nc),gini(n1825_nc),gini(n1700_nc)))
actor.gini_long <- pivot_longer(actor.gini,cols=c("gpe", "person", "org", "norp"))

# The period labels happen to sort alphabetically in chronological order,
# so the x axis is ordered correctly without an explicit factor.
ggplot(actor.gini_long,aes(x=period,y=value,shape=name,color=name,fill=name,group=name))+geom_point()+geom_line()+ylab("Gini coefficient")+xlab("Period")+ggtitle("Diversity of entities in newspaper texts.\nLower values indicate greater diversity.")

# Per-era entity frequency data frames (Var1 = entity, Freq = count) and the
# same frames ordered by descending frequency (df.*_ord).
for (era in c("1700", "1825", "1875", "1950", "1970",
              "1980", "1990", "2000", "2010")) {
  cnt <- data.frame(table(get(paste0("x", era, "_nc"))$entity))
  assign(paste0("df.", era, ".ent"), cnt)
  assign(paste0("df.", era, "_ord"), cnt[order(cnt$Freq, decreasing = TRUE), ])
}

# NOTE(review): match() returns positions or NA, so `> 0` yields only
# TRUE/NA; table() therefore just counts matched vs unmatched entities.
# `df.1700_ord$Var1 %in% df.2010_ord$Var1` would express this directly --
# confirm intent.
table(match(df.1700_ord$Var1,df.2010_ord$Var1)>0)

# Names of the per-era ordered entity tables used by the overlap loops below.
dford <- c("df.1700_ord", "df.1825_ord", "df.1875_ord", "df.1950_ord", "df.1970_ord",
        "df.1980_ord", "df.1990_ord", "df.2000_ord", "df.2010_ord")

# Pairwise overlap between per-era entity inventories.
# Fix: the original computed `table(a %in% b)[2]`, which returns NA instead
# of the count whenever the logical vector is all-TRUE (only one table cell)
# or all-FALSE; sum() counts the TRUE values correctly in every case.

# Count how many of the first `n_top` entities of the data frame named
# `nm_i` also occur among the entities of the data frame named `nm_j`
# (restricted to its first `n_ref` rows when `n_ref` is given).
overlap_count <- function(nm_i, nm_j, n_top, n_ref = NULL) {
  top <- get(nm_i)$Var1[1:n_top]   # may pad with NA; NA %in% x is FALSE
  ref <- get(nm_j)$Var1
  if (!is.null(n_ref)) ref <- ref[1:n_ref]
  sum(top %in% ref)
}

# Full 9x9 era-by-era overlap matrix for the given cutoffs, with the same
# dimnames as the original matrices.
overlap_matrix <- function(n_top, n_ref = NULL) {
  m <- matrix(NA, ncol = 9, nrow = 9, dimnames = list(dford, dford))
  for (i in seq_len(9)) {
    for (j in seq_len(9)) {
      m[i, j] <- overlap_count(dford[i], dford[j], n_top, n_ref)
    }
  }
  m
}

# Overlap between the top 1000 / 100 / 10 entities and all entities.
overlap_1000_to_all <- overlap_matrix(1000)
overlap_100_to_all <- overlap_matrix(100)
overlap_10_to_all <- overlap_matrix(10)

# Overlap between the top 1000 / 100 / 10 entities of both eras.
overlap_1000_to_1000 <- overlap_matrix(1000, 1000)
overlap_100_to_100 <- overlap_matrix(100, 100)
overlap_10_to_10 <- overlap_matrix(10, 10)

# Hand-coded category labels for the 100 most frequent 1785-1824 entities
# (NA = unclassified / OCR noise); `rank` records the frequency rank.
df.1700_ord$rank <- 1:dim(df.1700_ord)[1]
df.1700_ord$type <- NA
df.1700_ord$type[1:100] <- c(  "capital",NA,"language/people", "justice", "country",NA,"country", "legislative", "country", "capital",
                "country", "capital", "parliament",NA,NA,"country", "politician",NA,"country", "politician",NA,
                NA,"city", "city", "body", "region", "language/people", "country",NA,"country",NA,
                NA,"county", "person",NA,"street", "person",NA,"city", "military",NA,
                NA,"person", "capital", "executive", "region",NA,"person",NA,NA,NA,
                "executive", "county",NA,"person", "city",NA,"city", "county", "person",
                "person", "body", "executive", "country", "country",NA,"legislative", "place",NA,"city",
                "city", "person",NA,"executive",NA,NA,"executive",NA,"language/people", "language/people",
                NA,NA,NA,NA,"executive",NA,"place", "person", "city",NA,
                "religion", "city",NA,"society", "person", "body", "executive",NA,"service",NA)

# Hand-coded category labels for the 100 most frequent 1825-1874 entities
# (NA = unclassified / OCR noise); `rank` records the frequency rank.
df.1825_ord$type <- NA
df.1825_ord$rank <- 1:dim(df.1825_ord)[1]
df.1825_ord$type[1:100] <- c( "capital", "language/people", "justice", "country",NA,"country", "city", "language/people", "language/people", "legislative",
                    "country", "capital", "city", "street", "country",NA,NA,"city", "executive",NA,
                    "language/people",NA,"legislative", "language/people",NA,"country", "legislative", "executive",NA,NA,
                    NA,NA,"capital", "county", "county",NA,NA,NA,NA,NA,
                    "religion",NA,NA,"region",NA,NA,"language/people", "person",NA,"city",
                    "country",NA,"finance",NA,"service", "capital",NA,NA,"city",NA,
                    "legislative",NA,"executive", "language/people", "executive", "language/people",NA,"place", "religious", "country",
                    "person",NA,NA,"executive", "country", "ministry", "city",NA,"city", "country",
                    "ministry",NA,"country", "executive",NA,"language/people", "profession", "capital",NA,NA,
                    "city",NA,"country", "ministry", "party/camp", "city",NA,"person", "city", "executive")

df.1875_ord$type <- NA
df.1875_ord$rank <- 1:dim(df.1875_ord)[1]
df.1875_ord$type[1:100] <- c( "capital", "language/people", "country", "language/people", "country",NA,"person",NA,NA,"language/people",
                    "city", "city",NA,"legislative", "language/people", "country", "legislative", "justice", "language/people",NA,
                    "capital", "street", "city", "language/people", "country",NA,NA,NA,"county",NA,
                    NA,"executive", "country", "language/people",NA,NA,"capital", "city",NA,"city",
                    NA,"city",NA,"city", "city",NA,"language/people", "country",NA,"region",
                    "county", "body",NA,"country", "business", "city", "place",NA,"city", "business",
                    "country", "county", "profession", "legislative",NA,"country", "city", "religious",NA,"city",
                    NA,"language/people", "executive",NA,"place", "country",NA,NA,"country",NA,
                    "legislative", "country", "capital",NA,"language/people",NA,"country", "military",NA,"person",
                    "capital", "city", "media", "military",NA,"country",NA,"body", "city", "language/people")

df.1950_ord$type <- NA
df.1950_ord$rank <- 1:dim(df.1950_ord)[1]
df.1950_ord$type[1:100] <- c( "capital", "language/people", "media", "country", "capital", "language/people", "language/people", "country", "country", "country",
                    "language/people", "legislative", "language/people",NA,"region", "country", "city", "language/people", "country", "city",
                    "country", "country", "capital", "county", "business", "justice", "county", "country", "legislative", "government",
                    "country", "city", "federation",NA,"language/people",NA,"language/people", "media",NA,"country",
                    "language/people", "language/people", "legislative", "country", "county", "country",NA,NA,"executive", "party/camp",
                    "city", "federation", "city", "city", "politician", "language/people", "country", "country", "language/people", "country",
                    "language/people", "country", "language/people", "language/people", "military", "capital", "country",NA,"region", "city",
                    "country", "ministry", "country", "language/people", "city", "person",NA,"city", "country", "executive",
                    NA,"party/camp",NA,NA,"country", "business",NA,"city", "religious", "capital",
                    "city", "country", "country", "executive", "politician", "language/people", "party/camp", "country",NA,"country")



# Overlap of entity rankings across periods. Each expression reports the
# proportion of one period's entities that also occur in a reference period's
# ranking: element [2] of the prop.table is the TRUE share.
# NOTE(review): indexing with [2] assumes at least one FALSE is present in
# every comparison (otherwise [2] is NA) -- TODO confirm.

# Share of each period's TOP-1000 entities found anywhere in the 2010s ranking.
c(prop.table(table(df.1700_ord$Var1[1:1000]%in%df.2010_ord$Var1))[2],
prop.table(table(df.1825_ord$Var1[1:1000]%in%df.2010_ord$Var1))[2],
prop.table(table(df.1875_ord$Var1[1:1000]%in%df.2010_ord$Var1))[2],
prop.table(table(df.1950_ord$Var1[1:1000]%in%df.2010_ord$Var1))[2],
prop.table(table(df.1970_ord$Var1[1:1000]%in%df.2010_ord$Var1))[2],
prop.table(table(df.1980_ord$Var1[1:1000]%in%df.2010_ord$Var1))[2],
prop.table(table(df.1990_ord$Var1[1:1000]%in%df.2010_ord$Var1))[2],
prop.table(table(df.2000_ord$Var1[1:1000]%in%df.2010_ord$Var1))[2],
prop.table(table(df.2010_ord$Var1[1:1000]%in%df.2010_ord$Var1))[2])

# Full-list overlap with the 2000s ranking (auto-printed, one per line).
prop.table(table(df.1700_ord$Var1%in%df.2000_ord$Var1))[2]
prop.table(table(df.1825_ord$Var1%in%df.2000_ord$Var1))[2]
prop.table(table(df.1875_ord$Var1%in%df.2000_ord$Var1))[2]
prop.table(table(df.1950_ord$Var1%in%df.2000_ord$Var1))[2]
prop.table(table(df.1970_ord$Var1%in%df.2000_ord$Var1))[2]
prop.table(table(df.1980_ord$Var1%in%df.2000_ord$Var1))[2]
prop.table(table(df.1990_ord$Var1%in%df.2000_ord$Var1))[2]
prop.table(table(df.2000_ord$Var1%in%df.2000_ord$Var1))[2]
prop.table(table(df.2010_ord$Var1%in%df.2000_ord$Var1))[2]

# Full-list overlap with the 1990s ranking.
prop.table(table(df.1700_ord$Var1%in%df.1990_ord$Var1))[2]
prop.table(table(df.1825_ord$Var1%in%df.1990_ord$Var1))[2]
prop.table(table(df.1875_ord$Var1%in%df.1990_ord$Var1))[2]
prop.table(table(df.1950_ord$Var1%in%df.1990_ord$Var1))[2]
prop.table(table(df.1970_ord$Var1%in%df.1990_ord$Var1))[2]
prop.table(table(df.1980_ord$Var1%in%df.1990_ord$Var1))[2]
prop.table(table(df.1990_ord$Var1%in%df.1990_ord$Var1))[2]
prop.table(table(df.2000_ord$Var1%in%df.1990_ord$Var1))[2]
prop.table(table(df.2010_ord$Var1%in%df.1990_ord$Var1))[2]

# Full-list overlap with the 1980s ranking.
prop.table(table(df.1700_ord$Var1%in%df.1980_ord$Var1))[2]
prop.table(table(df.1825_ord$Var1%in%df.1980_ord$Var1))[2]
prop.table(table(df.1875_ord$Var1%in%df.1980_ord$Var1))[2]
prop.table(table(df.1950_ord$Var1%in%df.1980_ord$Var1))[2]
prop.table(table(df.1970_ord$Var1%in%df.1980_ord$Var1))[2]
prop.table(table(df.1980_ord$Var1%in%df.1980_ord$Var1))[2]
prop.table(table(df.1990_ord$Var1%in%df.1980_ord$Var1))[2]
prop.table(table(df.2000_ord$Var1%in%df.1980_ord$Var1))[2]
prop.table(table(df.2010_ord$Var1%in%df.1980_ord$Var1))[2]

# Full-list overlap with the 1970s ranking.
prop.table(table(df.1700_ord$Var1%in%df.1970_ord$Var1))[2]
prop.table(table(df.1825_ord$Var1%in%df.1970_ord$Var1))[2]
prop.table(table(df.1875_ord$Var1%in%df.1970_ord$Var1))[2]
prop.table(table(df.1950_ord$Var1%in%df.1970_ord$Var1))[2]
prop.table(table(df.1970_ord$Var1%in%df.1970_ord$Var1))[2]
prop.table(table(df.1980_ord$Var1%in%df.1970_ord$Var1))[2]
prop.table(table(df.1990_ord$Var1%in%df.1970_ord$Var1))[2]
prop.table(table(df.2000_ord$Var1%in%df.1970_ord$Var1))[2]
prop.table(table(df.2010_ord$Var1%in%df.1970_ord$Var1))[2]

# Full-list overlap with the 1950 ranking.
prop.table(table(df.1700_ord$Var1%in%df.1950_ord$Var1))[2]
prop.table(table(df.1825_ord$Var1%in%df.1950_ord$Var1))[2]
prop.table(table(df.1875_ord$Var1%in%df.1950_ord$Var1))[2]
prop.table(table(df.1950_ord$Var1%in%df.1950_ord$Var1))[2]
prop.table(table(df.1970_ord$Var1%in%df.1950_ord$Var1))[2]
prop.table(table(df.1980_ord$Var1%in%df.1950_ord$Var1))[2]
prop.table(table(df.1990_ord$Var1%in%df.1950_ord$Var1))[2]
prop.table(table(df.2000_ord$Var1%in%df.1950_ord$Var1))[2]
prop.table(table(df.2010_ord$Var1%in%df.1950_ord$Var1))[2]

# Full-list overlap with the 1875 ranking.
prop.table(table(df.1700_ord$Var1%in%df.1875_ord$Var1))[2]
prop.table(table(df.1825_ord$Var1%in%df.1875_ord$Var1))[2]
prop.table(table(df.1875_ord$Var1%in%df.1875_ord$Var1))[2]
prop.table(table(df.1950_ord$Var1%in%df.1875_ord$Var1))[2]
prop.table(table(df.1970_ord$Var1%in%df.1875_ord$Var1))[2]
prop.table(table(df.1980_ord$Var1%in%df.1875_ord$Var1))[2]
prop.table(table(df.1990_ord$Var1%in%df.1875_ord$Var1))[2]
prop.table(table(df.2000_ord$Var1%in%df.1875_ord$Var1))[2]
prop.table(table(df.2010_ord$Var1%in%df.1875_ord$Var1))[2]

# Full-list overlap with the 1825 ranking.
prop.table(table(df.1700_ord$Var1%in%df.1825_ord$Var1))[2]
prop.table(table(df.1825_ord$Var1%in%df.1825_ord$Var1))[2]
prop.table(table(df.1875_ord$Var1%in%df.1825_ord$Var1))[2]
prop.table(table(df.1950_ord$Var1%in%df.1825_ord$Var1))[2]
prop.table(table(df.1970_ord$Var1%in%df.1825_ord$Var1))[2]
prop.table(table(df.1980_ord$Var1%in%df.1825_ord$Var1))[2]
prop.table(table(df.1990_ord$Var1%in%df.1825_ord$Var1))[2]
prop.table(table(df.2000_ord$Var1%in%df.1825_ord$Var1))[2]
prop.table(table(df.2010_ord$Var1%in%df.1825_ord$Var1))[2]

# Full-list overlap with the 1700s ranking.
prop.table(table(df.1700_ord$Var1%in%df.1700_ord$Var1))[2]
prop.table(table(df.1825_ord$Var1%in%df.1700_ord$Var1))[2]
prop.table(table(df.1875_ord$Var1%in%df.1700_ord$Var1))[2]
prop.table(table(df.1950_ord$Var1%in%df.1700_ord$Var1))[2]
prop.table(table(df.1970_ord$Var1%in%df.1700_ord$Var1))[2]
prop.table(table(df.1980_ord$Var1%in%df.1700_ord$Var1))[2]
prop.table(table(df.1990_ord$Var1%in%df.1700_ord$Var1))[2]
prop.table(table(df.2000_ord$Var1%in%df.1700_ord$Var1))[2]
prop.table(table(df.2010_ord$Var1%in%df.1700_ord$Var1))[2]

# Gini "band" plots: each value is drawn as a 0.01-tall rectangle spanning
# its start/end period, so series can be compared as stacked bands over time.

# Faceted by corpus, filled by series name.
ggplot(df.gini_long,
       aes(ymin = value - 0.01, ymax = value,
           xmin = start, xmax = end,
           group = name, fill = name)) +
  geom_rect() +
  facet_grid(. ~ corpus) +
  ylim(0, 1)

# Faceted by series name, filled by corpus.
ggplot(df.gini_long,
       aes(ymin = value - 0.01, ymax = value,
           xmin = start, xmax = end,
           group = corpus, fill = corpus)) +
  geom_rect() +
  facet_wrap(. ~ name) +
  ylim(0, 1)

# As above, but on the diversity scale (1 - Gini).
ggplot(df.gini_long,
       aes(ymin = 1 - (value - 0.01), ymax = 1 - value,
           xmin = start, xmax = end,
           group = corpus, fill = corpus)) +
  geom_rect() +
  facet_wrap(. ~ name) +
  ylim(0, 1)

# Assign sequential document ids ("text1", "text2", ...) to every crisis-corpus
# period table. seq_len(nrow(.)) is used instead of the original 1:dim(.)[1],
# which would yield c(1, 0) (and then fail) on a zero-row data frame.
for (nm in c("c1700s", "c1800s", "c1900s", "c1925s", "c1950s", "c1970s",
             "c1980s", "c1990s", "c2000s", "c2010s", "c2015s", "c2018s")) {
  obj <- get(nm)
  obj$doc_id <- paste0("text", seq_len(nrow(obj)))
  assign(nm, obj)
}

# t1800ss <- rbind(t1800s,t1825s,t1850s,t1875s)

# Attach the publication year to each routine-corpus entity table by matching
# doc_id against the corresponding token table (x{Y}_nc gets year_num from
# t{Y}s). One loop replaces thirteen identical statements.
for (y in c("1700", "1800", "1825", "1850", "1875", "1900", "1925",
            "1950", "1970", "1980", "1990", "2000", "2010")) {
  src <- get(paste0("t", y, "s"))
  tgt <- get(paste0("x", y, "_nc"))
  tgt$year <- src[match(tgt$doc_id, src$doc_id), "year_num"]
  assign(paste0("x", y, "_nc"), tgt)
}
# x1800$year <- t1800ss[match(x1800$doc_id,t1800ss$doc_id),"year_num"]
# x2015_nc$year <- t2015s[match(x2015_nc$doc_id,t2015s$doc_id),"year_num"]
# x2018_nc$year <- t2018s[match(x2018_nc$doc_id,t2018s$doc_id),"year_num"]

# Same year join for the crisis corpus: x{Y} gets year_num from c{Y}s,
# matched on doc_id.
for (y in c("1700", "1800", "1900", "1925", "1950", "1970",
            "1980", "1990", "2000", "2010", "2015", "2018")) {
  src <- get(paste0("c", y, "s"))
  tgt <- get(paste0("x", y))
  tgt$year <- src[match(tgt$doc_id, src$doc_id), "year_num"]
  assign(paste0("x", y), tgt)
}


# Crisis corpus: pool all periods, then build per-year (1785-2020) top-1000
# entity frequency rankings -- overall and separately for the GPE, ORG,
# PERSON and NORP entity types.
cc <- rbind(x1700,x1800,x1900,x1925,x1950,x1970,x1980,x1990,x2000,x2010,x2015,x2018)

# Rank the entities of `dat` by frequency and return the top `top_n` as a
# data.frame; returns a single all-NA row (matching the accumulator seed
# below) when there is nothing to rank -- same behavior as the original
# copy-pasted per-type code.
rank_top_entities <- function(dat, year, type_label, top_n = 1000) {
  n_keep <- min(top_n, length(unique(dat$entity)))
  tab <- table(dat$entity)
  ranking <- tab[order(tab, decreasing = TRUE)][seq_len(n_keep)]
  if (!any(!is.na(ranking))) {
    data.frame(entity = NA, count = NA, rank = NA, year = NA, type = NA)
  } else {
    data.frame(entity = names(ranking), count = as.numeric(ranking),
               rank = seq_along(ranking), year = year, type = type_label)
  }
}

yrs <- 1785:2020
seed <- data.frame(entity = NA, count = NA, rank = NA, year = NA, type = NA)
# Preallocated per-year pieces (the original grew five data frames with
# rbind() inside the loop, which is O(n^2) in copies).
parts      <- vector("list", length(yrs))
parts.gpe  <- vector("list", length(yrs))
parts.org  <- vector("list", length(yrs))
parts.per  <- vector("list", length(yrs))
parts.norp <- vector("list", length(yrs))
for (k in seq_along(yrs)) {
  i <- yrs[k]
  xxi <- subset(cc, year == i)
  parts[[k]]      <- rank_top_entities(xxi, i, NA)
  parts.gpe[[k]]  <- rank_top_entities(subset(xxi, entity_type == "GPE"),    i, "gpe")
  parts.org[[k]]  <- rank_top_entities(subset(xxi, entity_type == "ORG"),    i, "org")
  parts.per[[k]]  <- rank_top_entities(subset(xxi, entity_type == "PERSON"), i, "per")
  parts.norp[[k]] <- rank_top_entities(subset(xxi, entity_type == "NORP"),   i, "norp")
  flush.console()
  print(i)  # progress indicator
}
# Bind once at the end; the all-NA seed row comes first, exactly as in the
# original accumulators.
dfrank      <- do.call(rbind, c(list(seed), parts))
dfrank.gpe  <- do.call(rbind, c(list(seed), parts.gpe))
dfrank.org  <- do.call(rbind, c(list(seed), parts.org))
dfrank.per  <- do.call(rbind, c(list(seed), parts.per))
dfrank.norp <- do.call(rbind, c(list(seed), parts.norp))

# Routine corpus: same per-year top-1000 entity rankings as for the crisis
# corpus above, producing df.rank / df.rank.{gpe,org,per,norp}.
xx <- rbind(x1700_nc,x1800_nc,x1825_nc,x1850_nc,x1875_nc,x1900_nc,x1925_nc,x1950_nc,x1970_nc,x1980_nc,x1990_nc,x2000_nc,x2010_nc)

# Rank the entities of `dat` by frequency and return the top `top_n` as a
# data.frame; a single all-NA row (the accumulator seed) when there is
# nothing to rank. (Redefined here so this block is self-contained.)
rank_top_entities <- function(dat, year, type_label, top_n = 1000) {
  n_keep <- min(top_n, length(unique(dat$entity)))
  tab <- table(dat$entity)
  ranking <- tab[order(tab, decreasing = TRUE)][seq_len(n_keep)]
  if (!any(!is.na(ranking))) {
    data.frame(entity = NA, count = NA, rank = NA, year = NA, type = NA)
  } else {
    data.frame(entity = names(ranking), count = as.numeric(ranking),
               rank = seq_along(ranking), year = year, type = type_label)
  }
}

yrs <- 1785:2020
seed <- data.frame(entity = NA, count = NA, rank = NA, year = NA, type = NA)
rparts      <- vector("list", length(yrs))
rparts.gpe  <- vector("list", length(yrs))
rparts.org  <- vector("list", length(yrs))
rparts.per  <- vector("list", length(yrs))
rparts.norp <- vector("list", length(yrs))
for (k in seq_along(yrs)) {
  i <- yrs[k]
  xxi <- subset(xx, year == i)
  rparts[[k]]      <- rank_top_entities(xxi, i, NA)
  rparts.gpe[[k]]  <- rank_top_entities(subset(xxi, entity_type == "GPE"),    i, "gpe")
  rparts.org[[k]]  <- rank_top_entities(subset(xxi, entity_type == "ORG"),    i, "org")
  rparts.per[[k]]  <- rank_top_entities(subset(xxi, entity_type == "PERSON"), i, "per")
  rparts.norp[[k]] <- rank_top_entities(subset(xxi, entity_type == "NORP"),   i, "norp")
  flush.console()
  print(i)  # progress indicator
}
# Bind once; the all-NA seed row stays first (later removed for df.rank only).
df.rank      <- do.call(rbind, c(list(seed), rparts))
df.rank.gpe  <- do.call(rbind, c(list(seed), rparts.gpe))
df.rank.org  <- do.call(rbind, c(list(seed), rparts.org))
df.rank.per  <- do.call(rbind, c(list(seed), rparts.per))
df.rank.norp <- do.call(rbind, c(list(seed), rparts.norp))

# Drop the all-NA seed row (and any empty-year placeholder rows) from the
# routine-corpus overall ranking.
df.rank <- df.rank[!is.na(df.rank$rank),]

# NOTE(review): this assumes df.rank now contains exactly 1000 ranked
# entities per year for 230 years (1785-2014), even though the loop above ran
# through 2020 -- the recycled sequence will misalign otherwise; confirm.
df.rank$ordrank <- c(rep(seq(1,1000,1),times=length(1785:2014)))
# Entity type via lookup in the pooled token table; match() uses the first
# occurrence of each entity name.
df.rank$entity_type <- xx[match(df.rank$entity,xx$entity),"entity_type"]

# Stack the per-type rankings and tag each with its corpus of origin.
df.allrank <- rbind(df.rank.org,df.rank.per,df.rank.gpe,df.rank.norp)
df.allrank$corpus <- "routine"
cc.allrank <- rbind(dfrank.org,dfrank.per,dfrank.gpe,dfrank.norp)
cc.allrank$corpus <- "crisis"

allrank <- rbind(df.allrank,cc.allrank)

# Actor-concentration (Gini) summaries. Each tapply() is computed exactly
# once and reused; the original evaluated every tapply twice (once for an
# unused auto-named seed column, once for `actor_gini`). Row names carry the
# interaction labels, which the year/type/corpus extractions below rely on.

# Per year, over the routine-corpus overall ranking.
# NOTE(review): this Gini is computed over the rank values (1..n), not the
# counts -- the per-type version below uses counts; confirm intent.
g_year <- tapply(df.rank$rank, df.rank$year, Gini)
gini_year <- data.frame(actor_gini = g_year)
gini_year$year <- as.numeric(rownames(gini_year))

# Per year x entity type (routine corpus), over counts; rownames look like
# "1785.gpe".
g_year_type <- tapply(df.allrank$count, interaction(df.allrank$year, df.allrank$type), Gini)
gini_year_type <- data.frame(actor_gini = g_year_type)
gini_year_type$year <- as.numeric(stringr::str_extract(rownames(gini_year_type), pattern = "[:digit:]{4,4}"))
gini_year_type$type <- (stringr::str_remove(rownames(gini_year_type), pattern = "[:digit:]{4,4}\\."))

# Per year x entity type x corpus; rownames look like "1785.gpe.crisis", so
# `type` keeps the surrounding dots (".gpe.") -- downstream code matches on
# that dotted form.
g_ytc <- tapply(allrank$count, interaction(allrank$year, allrank$type, allrank$corpus), Gini)
gini_year_type_corpus <- data.frame(actor_gini = g_ytc)
gini_year_type_corpus$type <- (stringr::str_extract(rownames(gini_year_type_corpus), pattern = "\\.[:alpha:]{3,4}\\."))
gini_year_type_corpus$corpus <- (stringr::str_extract(rownames(gini_year_type_corpus), pattern = "crisis|routine"))
gini_year_type_corpus$year <- as.numeric(stringr::str_extract(rownames(gini_year_type_corpus), pattern = "[:digit:]{4,4}"))

# Does actor concentration differ by entity type, corpus, and (quadratic) time?
summary(lm(actor_gini~type*(corpus),data=gini_year_type_corpus))
summary(lm(actor_gini~type*(corpus*year)+type*(corpus*I(year^2)),data=gini_year_type_corpus))

# Diversity (1 - Gini) over time: overall...
ggplot(gini_year,aes(y=1-actor_gini,x=year))+geom_point()+geom_smooth()

# ...per entity type (theme_bluewhite() is presumably defined earlier or from
# a themes package -- not visible in this chunk)...
ggplot(gini_year_type,aes(y=1-actor_gini,x=year,color=type,fill=type,shape=type))+geom_point()+geom_smooth(span=0.25)+facet_wrap(type~.)+theme_bluewhite()

# ...and per entity type x corpus.
ggplot(subset(gini_year_type_corpus,!is.na(actor_gini)),aes(y=1-actor_gini,x=year,color=type,fill=type,shape=type))+geom_point()+geom_smooth(span=0.25)+facet_grid(type~corpus)+theme_bluewhite()

ggplot(subset(gini_year_type_corpus,!is.na(actor_gini)),aes(y=1-actor_gini,x=year,color=corpus,fill=corpus,shape=type))+geom_point()+geom_smooth(span=0.25)+facet_grid(type~.)+theme_bluewhite()+scale_color_viridis_d(option="inferno",begin=0,end=0.85)+scale_fill_viridis_d(option="inferno",begin=0,end=0.85)

# Crisis-news-wave (CNW) volume per year: observed wave volume plus the
# duration-weighted mean of the five baseline windows (90/180/365/730/1825).
wavecount_year <- data.frame(table(wd250x$year))
wd250x$volume2 <- wd250x$volume+wd250x$duration*((wd250x$baseline90+wd250x$baseline180+wd250x$baseline365+wd250x$baseline730+wd250x$baseline1825)/5)
wavevolume_year <- data.frame(cnw_articles=tapply(wd250x$volume2,wd250x$year,sum))
wavevolume_year$year <- as.numeric(rownames(wavevolume_year))

# Year-level panel 1785-2020. match() aligns each source to the full year
# range, leaving NA for years absent from a source.
yeardat <- data.frame(year=1785:2020,
    times_total_coverage=thetimes$articles,
    times_routine_coverage=as.numeric(table(nc.r$year)[match(1785:2020,names(table(nc.r$year)))]),
    times_crisis_coverage=as.numeric(table(df.text$year)[match(1785:2020,names(table(df.text$year)))]),
    times_cnw_coverage=wavevolume_year[match(1785:2020,wavevolume_year$year),"cnw_articles"],
    times_cnw_count=wavecount_year[match(1785:2020,wavecount_year$Var1),"Freq"])
    
# Attach yearly actor-diversity (Gini) series to the panel, aligned to
# 1785:2020. One helper replaces eight near-identical statements, each of
# which also evaluated the same subset() twice.
diversity_series <- function(corp, typ) {
  # `type` in gini_year_type_corpus keeps the interaction dots, e.g. ".gpe.".
  sel <- subset(gini_year_type_corpus, corpus == corp & type == paste0(".", typ, "."))
  sel$actor_gini[match(1785:2020, sel$year)]
}

yeardat$times_gpe_diversity  <- diversity_series("routine", "gpe")
yeardat$times_per_diversity  <- diversity_series("routine", "per")
yeardat$times_org_diversity  <- diversity_series("routine", "org")
yeardat$times_norp_diversity <- diversity_series("routine", "norp")

yeardat$times_crisis_gpe_diversity  <- diversity_series("crisis", "gpe")
yeardat$times_crisis_per_diversity  <- diversity_series("crisis", "per")
yeardat$times_crisis_org_diversity  <- diversity_series("crisis", "org")
yeardat$times_crisis_norp_diversity <- diversity_series("crisis", "norp")

# Zero-fill years without crisis-news-wave coverage/counts.
# BUG FIX: the original second statement assigned the zero-filled *count*
# vector to `times_cnw_coverage`, clobbering the line above and leaving
# `times_cnw_count` with its NAs; each column is now filled in place.
yeardat$times_cnw_coverage <- replace(yeardat$times_cnw_coverage,is.na(yeardat$times_cnw_coverage),0)
yeardat$times_cnw_count <- replace(yeardat$times_cnw_count,is.na(yeardat$times_cnw_count),0)

# Crisis GPE diversity regressed on time and the coverage/diversity controls.
# BUG FIX: `year*year` in an R formula simplifies to just `year` (a term
# interacted with itself), so no quadratic term was fitted; I(year^2) matches
# the quadratic specification used for the type-by-corpus model earlier.
summary(lm(times_crisis_gpe_diversity~year+I(year^2)+times_total_coverage+times_crisis_coverage+times_cnw_count+times_cnw_coverage+times_crisis_org_diversity+times_crisis_norp_diversity+times_crisis_per_diversity+times_norp_diversity+times_per_diversity+times_org_diversity+times_gpe_diversity,data=yeardat))

save(yeardat,file="yeardat.RData")
    
    ,
    times_gpe_diversity=subset(gini_year_type_corpus,corpus=="routine" & type==".gpe.")$actor_gini,
    times_per_diversity=subset(gini_year_type_corpus,corpus=="routine" & type==".per.")$actor_gini,
    times_norp_diversity=subset(gini_year_type_corpus,corpus=="routine" & type==".norp.")$actor_gini,
    times_org_diversity=subset(gini_year_type_corpus,corpus=="routine" & type==".org.")$actor_gini,
    times_crisis_gpe_diversity=subset(gini_year_type_corpus,corpus=="crisis" & type==".gpe.")$actor_gini,
    times_crisis_per_diversity=subset(gini_year_type_corpus,corpus=="crisis" & type==".per.")$actor_gini,
    times_crisis_norp_diversity=subset(gini_year_type_corpus,corpus=="crisis" & type==".norp.")$actor_gini,
    times_crisis_org_diversity=subset(gini_year_type_corpus,corpus=="crisis" & type==".org.")$actor_gini)
    
    times_crisis_topic_spectrum=,
    times_crisis_topic_diversity=,
    
        times_topic_spectrum=,times_topic_diversity=,


# Vector-autoregression / impulse-response analysis (VAR() and irf() are
# presumably from the vars package, loaded elsewhere): does routine-coverage
# actor diversity drive crisis-coverage diversity? Rows 30:230 correspond to
# years 1814-2014 -- TODO confirm the intended estimation window.
yeardat_r <- yeardat[30:230,c("times_crisis_norp_diversity", "times_crisis_gpe_diversity", "times_norp_diversity", "times_gpe_diversity")]

# NOTE(review): yeardat_rr / yeardat_vv are extracted but never used below.
yeardat_rr <- yeardat[30:230,"year"]

# GPE/NORP system: VAR with constant+trend, lag order chosen up to 7.
VAR.GPE <- VAR(y=yeardat_r,type="both",lag.max=7)
IRF.GPE1 <- irf(VAR.GPE,impulse="times_norp_diversity",response="times_crisis_norp_diversity")
IRF.GPE2 <- irf(VAR.GPE,impulse="times_gpe_diversity",response="times_crisis_gpe_diversity")
IRF.GPE3 <- irf(VAR.GPE,impulse="times_crisis_gpe_diversity",response="times_crisis_norp_diversity")
IRF.GPE4 <- irf(VAR.GPE,impulse="times_crisis_norp_diversity",response="times_crisis_gpe_diversity")

# Parallel system for PERSON / ORG diversity.
yeardat_v <- yeardat[30:230,c("times_crisis_per_diversity", "times_crisis_org_diversity", "times_per_diversity", "times_org_diversity")]

yeardat_vv <- yeardat[30:230,"year"]

VAR.ORG <- VAR(y=yeardat_v,type="both",lag.max=7)
IRF.ORG1 <- irf(VAR.ORG,impulse="times_org_diversity",response="times_crisis_org_diversity")
IRF.ORG2 <- irf(VAR.ORG,impulse="times_per_diversity",response="times_crisis_per_diversity")
IRF.ORG3 <- irf(VAR.ORG,impulse="times_crisis_org_diversity",response="times_crisis_per_diversity")
IRF.ORG4 <- irf(VAR.ORG,impulse="times_crisis_per_diversity",response="times_crisis_org_diversity")

4.6 Automated Entity Classification—Text Collocations (KWIC)

Code
##################
###

library(rvest)
library(WikipediaR)
library(RCurl)
library(stringr)
library(stringdist)
library(pbapply)
library(quanteda)
library(matrixStats)
library(stm)

# Collapse the per-token table into one text string per document for the
# 2000s actors.
text_2000 <- aggregate(actors2000$token, by = list(actors2000$doc_id), FUN = "paste", collapse = " ")

# Extended stopword list: standard English stopwords plus honorifics, page
# numbers, prices/dates, single letters, and frequent OCR artifacts/garbled
# characters specific to the digitized Times corpus.
sw <- c(
  stopwords("en"), "mr", "miss", "mrs", "sir", "said",
  "__", "___", "0", "00", "000", "01", "02", "03", "04", "05", "07", "0o", "0t", "1", "1,000",
  "1,500", "1.1", "10", "10,000", "100", "100,000", "1001", "101", "102", "103",
  "104", "105", "106", "107", "108", "109", "10s", "10th", "11", "110",
  "111", "112", "113", "114", "115", "116", "117", "118", "119", "11th",
  "12", "120", "121", "122", "123", "124", "125", "127", "12s", "12th",
  "13", "130", "131", "132", "133", "134", "135", "13s", "13th", "14",
  "140", "141", "145", "14s", "14th", "15", "150", "151", "155", "15s", "15th",
  "16", "160", "161", "16s", "16th", "17", "170", "171", "17s", "17th", "18", "180", "181", "1870",
  "18s", "18th", "19", "190", "191",
  "1910", "19th", "1a", "1b", "1c",
  "1d", "1e", "1f", "1h", "1i",
  "1l", "1m", "1n", "1o", "1r",
  "1s", "1st", "1t", "1th", "1v",
  "1w", "1y", "2", "2,000", "2.30",
  "20", "20,000", "200", "201", "20s",
  "20th", "21", "210", "211", "212",
  "21s", "21st", "22", "220", "22d",
  "23", "230", "23d", "24", "24s",
  "24th", "25", "250", "25s", "25th",
  "26", "26th", "27", "27th", "28",
  "28th", "29", "29th", "2a", "2d",
  "2i", "2l", "2nd", "2o", "2r",
  "2s", "2t", "3", "3,000", "30",
  "300", "30s", "30th", "31", "31r",
  "31st", "32", "33", "34", "35",
  "350", "36", "37", "38", "39",
  "3a", "3d", "3i", "3ir", "3l",
  "3mr", "3o", "3r", "3rd", "3s",
  "3t", "4", "4,000", "40", "400",
  "40s", "41", "42", "43", "44",
  "45", "46", "47", "48", "49",
  "4a", "4d", "4i", "4l", "4o",
  "4s", "4t", "4th", "5", "5,000",
  "50", "500", "51", "52", "53",
  "54", "55", "56", "57", "58",
  "59", "5d", "5i", "5s", "5t",
  "5th", "6", "60", "600", "61",
  "62", "63", "64", "65", "66",
  "67", "68", "69", "6d", "6f",
  "6i", "6s", "6t", "6th", "7",
  "7.30", "70", "700", "71", "72",
  "73", "74", "75", "76", "77",
  "78", "79", "7d", "7s", "7th",
  "8", "80", "800", "81", "82",
  "83", "84", "85", "86", "87",
  "88", "89", "8d", "8s", "8th",
  "9", "90", "91", "92", "93",
  "94", "95", "96", "97", "98",
  "99", "9d", "9s", "9th", "a",
  "â", "ã", "a.d", "a.m", "a1",
  "a2", "a3", "a4", "a5", "aa",
  "aâ", "aad", "ab", "oclock", "minut", "yard", "morn", "half-past", "dont", "don't", "say", "did", "didn't", "get", "want", "just", "think", "cant", "cannot", "theyr", "wasnt", "wouldnt", "wasn't", "wouldn't", "havent", "haven't", "shouldn't", "ing", "tion", "con", "com", "pro", "ã‚â£", "ã‚â£m", "â£m", "•", "â–º", "“the", "britain’", "¬", "â", "â€", "~", "ã", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "B", "C", "D", "E", "F", "G", "H", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "ll", "^", "th", "ot", "15", "20", "te", ">", "id", "il", "´", "`", "la", "ii", "en", "al", "tbe", "re", "lt", "li", "ar", "fo", "thie", "°", "aud", "le", "es", "ss", "od", "bo", "ir", "tho"
)

# Build full-text Wikipedia search URLs for the extracted entity names.
strt <- c("https://en.wikipedia.org/w/index.php?search=")
nd <- c("&title=Special%3ASearch&fulltext=1&ns0=1")

# DEAD CODE (disabled): these two transformations of names(orgs_o100) were
# discarded immediately, because `entities` is reassigned from cc_orgs on the
# very next statement.
# entities <- stringr::str_replace_all(names(orgs_o100), "\\_", "+")
# entities <- stringr::str_replace_all(entities, "\n", "")

# Top-100 entity names per sub-corpus (first100names defined elsewhere),
# flattened into one unique, NA-free vector.
entities <- lapply(cc_orgs, first100names)

entvec <- unique(do.call(c, entities))

entvec <- entvec[!is.na(entvec)]

# Underscores back to spaces, then wrapped as quanteda multi-word phrases.
rentvec <- (str_replace_all(entvec, "_", " "))

phrasevec <- phrase(rentvec)

# One search URL per phrase.
search <- paste0(strt, phrasevec, nd)

# Spot-check keyword-in-context windows for individual entities.
kwic_o1 <- (kwic(x = atok, pattern = entvec[1], window = 10, valuetype = "fixed", case_insensitive = TRUE))
kwic_o2 <- (kwic(x = atok, pattern = entvec[3353], window = 10, valuetype = "fixed", case_insensitive = TRUE))

# Post-2000 token subset for faster interactive inspection.
a <- tokens_subset(atok, year > 2000)

(kwic(x = a, pattern = phrasevec[[14]], window = 10, valuetype = "fixed", case_insensitive = TRUE))

(kwic(x = a, pattern = phrasevec[23], window = 10, valuetype = "fixed", case_insensitive = TRUE))

# Sentence-level tokenization of the corpus (`cp` defined elsewhere).
sent <- tokens(cp, what = "sentence", remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE, split_hyphens = FALSE)



# suwo <- surround_words(pattern=entities[[1]],texts=tokens_subset(atok,year==2020))

# surround_words2 <- function(pattern,texts){
#   kwic_10 <- (kwic(x=texts,pattern=pattern,window=10,valuetype="fixed",case_insensitive=TRUE))
#   wordbag <- paste(unlist(kwic_10$pre),unlist(kwic_10$post))
#   dfm_wb <- dfm(tokens(wordbag))
#   dfm_wb %>% dfm_trim(min_termfreq=5) -> dfm_wbm
#   return(dfm_wbm)
#   }

# Collect the 10-token context windows around every entity phrase
# (surround_words defined elsewhere). The list is preallocated and
# seq_along() used, so the loop is safe for an empty phrasevec (the original
# 1:length() form would iterate over c(1, 0)).
suwo <- vector("list", length(phrasevec))

for (i in seq_along(phrasevec))
{
  suwo[[i]] <- surround_words(pattern = phrasevec[i], texts = atok)
  print(i)  # progress indicator
  flush.console()
}

# Two representations of the context windows: collapsed into one document per
# entity (x) and kept as separate window documents (y); both tokenized and
# turned into document-feature matrices.
suwo_x <- do.call(c, pblapply(suwo, paste, collapse = " "))

suwo_y <- do.call(c, suwo)

tok_suwo_x <- tokens(suwo_x, remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE, split_hyphens = FALSE)

tok_suwo_y <- tokens(suwo_y, remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE, split_hyphens = FALSE)

dfm_suwo_x <- dfm(tok_suwo_x)

dfm_suwo_y <- dfm(tok_suwo_y)

# Trim rare features, drop the extended stopword list, and convert both
# matrices to stm's input format.
out_suwo_x <- dfm_suwo_x %>%
  dfm_trim(min_docfreq = 10) %>%
  dfm_remove(pattern = sw) %>%
  convert(to = "stm")

out_suwo_y <- dfm_suwo_y %>%
  dfm_trim(min_docfreq = 10) %>%
  dfm_remove(pattern = sw) %>%
  convert(to = "stm")

# Structural topic models over the collapsed entity-context documents:
# spectral initialization, no prevalence covariates, capped at 50 EM
# iterations; fitted at K = 50 and K = 120 topics.
stm_suwo_x <- stm(
  documents = out_suwo_x$documents,
  vocab = out_suwo_x$vocab,
  K = 50,
  prevalence = ~1,
  max.em.its = 50,
  data = out_suwo_x$meta,
  init.type = "Spectral"
)

stm_suwo_x_120 <- stm(
  documents = out_suwo_x$documents,
  vocab = out_suwo_x$vocab,
  K = 120,
  prevalence = ~1,
  max.em.its = 50,
  data = out_suwo_x$meta,
  init.type = "Spectral"
)

# Topic labels for the K = 120 model. labelTopics() is called once and
# indexed (the original recomputed it four times), and the seven label words
# per topic are collapsed with one helper instead of four hand-written
# seven-argument paste() calls.
lt <- labelTopics(stm_suwo_x_120)
aaa <- lt[[1]]  # highest-probability words
bbb <- lt[[2]]  # FREX words
ccc <- lt[[3]]  # lift words
ddd <- lt[[4]]  # score words

# Join the first seven label words of each topic row into one string.
collapse_labels <- function(m) apply(m[, 1:7, drop = FALSE], 1, paste, collapse = " ")

topics <- data.frame(
  X = paste0("X", 1:120),
  hiprob = collapse_labels(aaa),
  frex = collapse_labels(bbb),
  lift = collapse_labels(ccc),
  score = collapse_labels(ddd)
)

# Per-document (= per-entity) topic proportions from the fitted model.
probs <- data.frame(stm_suwo_x_120$theta)

# Recover each document's entity index from the stm document name -- assumes
# the trailing digits index into entvec (e.g. "text123" -> entvec[123]);
# TODO confirm against how the documents were named.
actornumbers <- str_extract(names(out_suwo_x$documents), "[:digit:]{1,4}")

probs$name <- entvec[as.numeric(actornumbers)]

# Maximum topic probability per entity (rowMaxs from matrixStats).
rm.probs <- rowMaxs(as.matrix(probs[, 1:120]))

# Assign each entity its modal topic and that topic's label strings.
df_orgs <- data.frame(name = NA, X = NA, hiprob = NA, frex = NA, lift = NA, score = NA)

for (i in seq_along(rm.probs))
{
  # BUG FIX: the original used which(probs[i, 1:120] == rm.probs[i]), a
  # floating-point equality test that returns *all* matching columns -- on a
  # tie, topics[coord, ] has several rows and the single-row assignment
  # breaks. which.max() always yields exactly one (the first) maximum.
  coord <- which.max(as.numeric(probs[i, 1:120]))
  df_orgs[i, ] <- data.frame(name = probs[i, "name"], topics[coord, ])
}

# Manual inspection: entity names assigned to topic X1 / X2, plus one
# example of the topic's score words.
df_orgs$name[which(df_orgs$X == "X1")]
df_orgs$score[which(df_orgs$X == "X1")][1]

df_orgs$name[which(df_orgs$X == "X2")]
df_orgs$score[which(df_orgs$X == "X2")][1]

df_orgs$name[which(df_orgs$X == "X1")]

### POLITICAL SYSTEM

# Manual coding of the 120 topics into actor categories.  Each iNN vector
# lists the topic IDs assigned to that category; a topic may appear in
# several categories (e.g. X11, X15), which matters both for the Recode()
# below (first matching rule wins) and for the multiplicity weights `iii`.

i11 <- c("X3", "X11", "X15", "X17", "X18", "X32", "X37", "X40", "X44", "X53", "X54", "X56", "X88", "X92", "X94", "X100", "X105", "X109", "X117", "X119", "X120", "X93") # Executive, Ministry/Agency
i12 <- c("X11", "X40", "X47", "X54", "X88", "X119") # Parties
i13 <- c("X9", "X11", "X40", "X44", "X47") # Movements, civil society
i14 <- c("X14", "X22", "X32", "X86", "X90", "X119", "X120") # Legislative
# i15 <- c('','','','','','','','','','','','',) # Justice, court
i16 <- c("X9", "X50") # Officials, police, emergency services
i17 <- c("X87", "X114") # Monarchy, royal family, "Crown"
i18 <- c("X45", "X79", "X120") # CentralBanks
# i19 <- c('','','','','','','','','','','','',) # Other administrative

i21 <- c("X15", "X45", "X95", "X112") # Intergovernmental orgs
i22 <- c("X15", "X78", "X112") # Foreign affairs
i23 <- c("X61", "X66") # Supranational, EU
i24 <- c("X5", "X6", "X7", "X10", "X11", "X12", "X15", "X16", "X18", "X22", "X30", "X33", "X44", "X47", "X50", "X51", "X52", "X54", "X68", "X70", "X73", "X78", "X83", "X91", "X105", "X110", "X112") # Foreign countries, regions, cities
i25 <- c("X2", "X6", "X7", "X10", "X11", "X16", "X21", "X30", "X43", "X75", "X80", "X82", "X84", "X85", "X98", "X106", "X114") # Regions, Cities

i31 <- c("X58", "X4", "X64") # Mining, extraction, raw materials, agriculture
i32 <- c("X23", "X64", "X65") # Financial, insurance
i33 <- c("X23", "X36", "X57", "X108", "X101") # Services, including travel, transportation
i34 <- c("X2", "X49", "X57", "X58", "X101") # Manufacturing
i35 <- c("X116", "X71") # Trades, chambers of commerce, traders, economy, businesses (general, varied)
i36 <- c("X32", "X58") # Trade unions
i37 <- c("X36", "X85") # Tech companies

i41 <- c("X18", "X22", "X29", "X33", "X42", "X63", "X75", "X77", "X109", "X112", "X113", "X115") # Military, intelligence service
i42 <- c("X2", "X72") # Healthcare system
i43 <- c("X62") # Education, Research, Science
i44 <- c("X94") # Terrorist organizations

i51 <- c("X15", "X27", "X33", "X34", "X35", "X41", "X80", "X85", "X107") # Media
i52 <- c("X47", "X98") # Churches, Religious
i53 <- c("X6") # Sports
i54 <- c("X10", "X28", "X31", "X60", "X92", "X94") # Film, Theater, Arts, Music/Dance, Literature
i55 <- c("X37") # Polling, market research, social research, economic research
i56 <- c("X108", "X113") # Foundations, Philanthropy, welfare

i80 <- c("X10", "X5", "X12", "X15", "X25", "X29", "X35", "X34", "X38", "X39", "X40", "X43", "X48", "X81", "X83", "X115") # Citizens/names/titles

i90 <- c("X1", "X4", "X29") # French

i99 <- c("X8", "X13", "X15", "X19", "X20", "X24", "X26", "X39", "X41", "X42", "X46", "X53", "X55", "X56", "X59", "X67", "X69", "X74", "X76", "X89", "X96", "X97", "X99", "X102", "X103", "X111", "X118") # Gibberish


# Map each dominant topic to its actor category.  car::Recode applies the
# first matching rule, so topic IDs listed under several categories (X11,
# X15, ...) end up in the earliest rule below.
# NOTE(review): no rule here produces 'i52' (Churches, X47/X98 per the i52
# vector above); X47 and X98 are captured by earlier rules, so the 'i52'
# factor level defined below stays empty -- confirm this is intended.
df_orgs$i <- Recode(df_orgs$X, " c('X3','X11','X15','X17','X18','X32','X37','X40','X44','X53','X54','X56','X88','X92','X94','X100','X105','X109','X117','X119','X120','X93')='i11';
            c('X11','X40','X47','X54','X88','X119')='i12';
            c('X9','X11','X40','X44','X47')='i13';
            c('X14','X22','X32','X86','X90','X119','X120')='i14';
            c('X9','X50')='i16';
            c('X87','X114')='i17';
            c('X45','X79','X120')='i18';
            c('X15','X45','X95','X112')='i21';
            c('X15','X78','X112')='i22';
            c('X61','X66')='i23';
            c('X5','X6','X7','X10','X11','X12','X15','X16','X18','X22','X30','X33','X44','X47','X50','X51','X52','X54','X68','X70','X73','X78','X83','X91','X105','X110','X112')='i24';
            c('X2','X6','X7','X10','X11','X16','X21','X30','X43','X75','X80','X82','X84',
            'X85','X98','X106','X114')='i25';

            c('X58','X4','X64')='i31';
            c('X23','X64','X65')='i32';
            c('X23','X36','X57','X108','X101')='i33';
            c('X2','X49','X57','X58','X101')='i34';
            c('X116','X71')='i35';
            c('X32','X58')='i36';
            c('X36','X85')='i37';

            c('X18','X22','X29','X33','X42','X63','X75','X77','X109','X112','X113','X115')='i41';
            c('X2','X72')='i42';
            c('X62')='i43';
            c('X94')='i44';

            c('X15','X27','X33','X34','X35','X41','X80','X85','X107')='i51';
            c('X6')='i53';
            c('X10','X28','X31','X60','X92','X94')='i54';
            c('X37')='i55';
            c('X108','X113')='i56';

            c('X10','X5','X12','X15','X25','X29','X35','X34','X38','X39','X40','X43',
            'X48','X81','X83','X115')='i80';

            c('X1','X4','X29')='i90';

            c('X8','X13','X15','X19','X20','X24','X26','X39','X41','X42','X46','X53','X55','X56','X59','X67','X69','X74','X76','X89','X96','X97','X99','X102','X103','X111','X118') ='i99'
          ")

# Human-readable labels for the category codes.
# NOTE(review): level 'i52' is listed but the Recode() above never produces
# it, so the "Churches, Religious" level will have zero observations.
df_orgs$prot <- factor(df_orgs$i, levels = c(
  "i11", "i12", "i13", "i14", "i16", "i17", "i18", "i21", "i22", "i23", "i24", "i25",
  "i31", "i32", "i33", "i34", "i35", "i36", "i37",
  "i41", "i42", "i43", "i44", "i51", "i52", "i53", "i54", "i55", "i56", "i80", "i90", "i99"
), labels = c(
  "Executive, Ministry/Agency", "Parties", "Movements, civil society", "Legislative", "Officials, police, emergency services", "Monarchy, royal family, Crown", "CentralBanks", "Intergovernmental orgs", "Foreign affairs", "Supranational, EU", "Foreign countries, regions, cities", "Regions, Cities",
  "Mining, extraction, raw materials, agriculture", "Financial, insurance", "Services, including travel, transportation", "Manufacturing", "Trades, chambers of commerce, traders, economy, businesses (general, varied)", "Trade unions", "Tech companies",
  "Military, Intelligence Service", "Healthcare System", "Education, Research, Science", "Terrorist Organizations", "Media", "Churches, Religious", "Sports", "Film, Theater, Dance, Arts, Crafts, Literature",
  "Polling, Market Research, Social Research, Economic Research", "Foundations, Philanthropy,Welfare",
  "Citizens, names, titles", "French", "Gibberish"
))

# quanteda dictionary mapping each category to the entity names whose
# dominant topic fell into it; used for tokens_lookup() below.
# NOTE(review): 'EcoTreade' is a typo for 'EcoTrade' but is kept because the
# dictionary key becomes a token type downstream; Churches will be empty
# (see the i52 note above).
actor_dictionary <- dictionary(list(
  Executive = subset(df_orgs, i == "i11")$name,
  Parties = subset(df_orgs, i == "i12")$name,
  Movements = subset(df_orgs, i == "i13")$name,
  Legislative = subset(df_orgs, i == "i14")$name,
  Authorities = subset(df_orgs, i == "i16")$name,
  RoyalFamily = subset(df_orgs, i == "i17")$name,
  CentralBanks = subset(df_orgs, i == "i18")$name,
  IntergovernmentalOrgs = subset(df_orgs, i == "i21")$name,
  ForeignAffairs = subset(df_orgs, i == "i22")$name,
  Supranational = subset(df_orgs, i == "i23")$name,
  ForeignCountries = subset(df_orgs, i == "i24")$name,
  RegionsCities = subset(df_orgs, i == "i25")$name,
  EcoExtraction = subset(df_orgs, i == "i31")$name,
  EcoFinancial = subset(df_orgs, i == "i32")$name,
  EcoServices = subset(df_orgs, i == "i33")$name,
  EcoIndustry = subset(df_orgs, i == "i34")$name,
  EcoTreade = subset(df_orgs, i == "i35")$name,
  EcoTradeUnions = subset(df_orgs, i == "i36")$name,
  EcoTech = subset(df_orgs, i == "i37")$name,
  Military = subset(df_orgs, i == "i41")$name,
  Health = subset(df_orgs, i == "i42")$name,
  EducationResearch = subset(df_orgs, i == "i43")$name,
  Terrorists = subset(df_orgs, i == "i44")$name,
  Media = subset(df_orgs, i == "i51")$name,
  Churches = subset(df_orgs, i == "i52")$name,
  Sports = subset(df_orgs, i == "i53")$name,
  ArtLiteratureFilm = subset(df_orgs, i == "i54")$name,
  PollingSocialResearch = subset(df_orgs, i == "i55")$name,
  FoundationsPhilanthropy = subset(df_orgs, i == "i56")$name,
  CitizensNamesTitles = subset(df_orgs, i == "i80")$name,
  French = subset(df_orgs, i == "i90")$name,
  Gibberish = subset(df_orgs, i == "i99")$name
))

# Replace entity tokens in the crisis-coverage corpus by their actor
# category; tokens matching no dictionary entry become "UNMATCHED".
tok_actor <- tokens_lookup(x = cc.tokens, dictionary = actor_dictionary, nomatch = "UNMATCHED")



# Per-article counts of Executive mentions and unmatched tokens, with the
# article metadata carried over from cc.tokens.
# NOTE(review): `sumy` is defined elsewhere in this file -- presumably it
# counts occurrences of the category y; confirm.  After tokens_lookup,
# length(tok_actor[[d]]) counts matched + unmatched lookup hits, not the
# article's original word count -- the "wordcount" name may overstate.
actor.data <- data.frame(
  id = docvars(cc.tokens)$id,
  date = docvars(cc.tokens)$date,
  time = docvars(cc.tokens)$time,
  Time = docvars(cc.tokens)$Time,
  headline = docvars(cc.tokens)$headline,
  year = docvars(cc.tokens)$year,
  Executive = unlist(lapply(tok_actor, sumy, y = "Executive")),
  UNMATCHED = unlist(lapply(tok_actor, sumy, y = "UNMATCHED")),
  wordcount = unlist(lapply(tok_actor, length))
)

# Executive mentions per 1000 lookup tokens.
actor.data$Executive_rate <- 1000 * actor.data$Executive / actor.data$wordcount

# ggplot(actor.data,aes(y=Executive_rate,x=as.numeric(year)))+geom_smooth()

# ggplot(actor.data,aes(y=Executive_rate,x=as.numeric(year)))+geom_smooth()

### Re-declaration of actor category vectors.
### NOTE(review): i41-i99 below duplicate (with identical values) the
### definitions made before the Recode() call above; consider keeping one
### copy.  They are needed here so `ii` can be assembled.

i41 <- c("X18", "X22", "X29", "X33", "X42", "X63", "X75", "X77", "X109", "X112", "X113", "X115") # Military, intelligence service
i42 <- c("X2", "X72") # Healthcare system
i43 <- c("X62") # Education, Research, Science
i44 <- c("X94") # Terrorist organizations

i51 <- c("X15", "X27", "X33", "X34", "X35", "X41", "X80", "X85", "X107") # Media
i52 <- c("X47", "X98") # Churches, Religious
i53 <- c("X6") # Sports
i54 <- c("X10", "X28", "X31", "X60", "X92", "X94") # Film, Theater, Arts, Music/Dance, Literature
i55 <- c("X37") # Polling, market research, social research, economic research
i56 <- c("X108", "X113") # Foundations, Philanthropy, welfare

i80 <- c("X10", "X5", "X12", "X15", "X25", "X29", "X35", "X34", "X38", "X39", "X40", "X43", "X48", "X81", "X83", "X115") # Citizens/names/titles

i90 <- c("X1", "X4", "X29") # French

i99 <- c("X8", "X13", "X15", "X19", "X20", "X24", "X26", "X39", "X41", "X42", "X46", "X53", "X55", "X56", "X59", "X67", "X69", "X74", "X76", "X89", "X96", "X97", "X99", "X102", "X103", "X111", "X118") # Gibberish

# All category assignments concatenated; a topic ID appears once per
# category that claims it.
ii <- c(i11, i12, i13, i14, i16, i17, i18, i21, i22, i23, i24, i25, i31, i32, i33, i34, i35, i36, i37, i41, i42, i43, i44, i51, i52, i53, i54, i55, i56, i80, i90, i99)

# Multiplicity of each topic across categories; used below to down-weight
# topics shared between categories.
iii <- table(ii)

# Aggregate per-document topic probabilities into actor-category scores:
# each topic's probability is divided by its multiplicity in `iii` (the
# number of categories claiming it), then summed per category.
#
# FIX(review): the original computed probs[, cols] / iii[match(cols, ...)].
# For a data frame, `/` recycles the divisor vector element-wise in
# column-major order (down the rows), NOT one divisor per column, so every
# topic column was divided by a rotating mix of all the weights.
# sweep(..., MARGIN = 2) divides each topic column by its own multiplicity,
# which is what the weighting intends.  Single-topic categories (i43, i44,
# i53, i55) were unaffected and give the same result here.
# Assignment order is unchanged so the positional range 122:153 used below
# still addresses these 32 columns.
cat_share <- function(cols) {
  w <- iii[match(cols, names(iii))]
  rowSums(sweep(as.matrix(probs[, cols, drop = FALSE]), 2, w, "/"), na.rm = TRUE)
}

probs$i11 <- cat_share(i11)
probs$i12 <- cat_share(i12)
probs$i13 <- cat_share(i13)
probs$i14 <- cat_share(i14)
# probs$i15 <- cat_share(i15)  # Justice/courts were never coded (see i15 above)
probs$i16 <- cat_share(i16)
probs$i17 <- cat_share(i17)
probs$i18 <- cat_share(i18)

probs$i21 <- cat_share(i21)
probs$i22 <- cat_share(i22)
probs$i23 <- cat_share(i23)
probs$i24 <- cat_share(i24)
probs$i25 <- cat_share(i25)

probs$i31 <- cat_share(i31)
probs$i32 <- cat_share(i32)
probs$i33 <- cat_share(i33)
probs$i34 <- cat_share(i34)
probs$i35 <- cat_share(i35)
probs$i36 <- cat_share(i36)
probs$i37 <- cat_share(i37)

probs$i41 <- cat_share(i41)
probs$i42 <- cat_share(i42)
probs$i43 <- cat_share(i43)
probs$i44 <- cat_share(i44)

probs$i51 <- cat_share(i51)
probs$i52 <- cat_share(i52)
probs$i53 <- cat_share(i53)
probs$i54 <- cat_share(i54)
probs$i55 <- cat_share(i55)
probs$i56 <- cat_share(i56)

probs$i80 <- cat_share(i80)

probs$i90 <- cat_share(i90)
probs$i99 <- cat_share(i99)


# Matrix of the 32 per-category scores created above (columns 122:153 =
# i11 .. i99 in creation order).
# NOTE(review): positional indexing is fragile -- it assumes no columns were
# added to `probs` in between.
iprobs <- as.matrix(probs[, 122:153])

# Dominant category per document.  max.col() is the vectorized equivalent of
# the original per-row loop; ties.method = "first" picks the first maximum
# (the original which(== max) would have tried to assign several names to one
# cell on an exact tie and failed).  This also replaces the follow-up line
# `names(probs)[which(probs[1, ]) == rowMaxs(...)]`, whose misplaced
# parenthesis made it error at run time.
probs$i <- colnames(iprobs)[max.col(iprobs, ties.method = "first")]

# NOTE(review): stringr::str_replace_all() expects a pattern string or a
# named character vector (pattern -> replacement); passing the `probs` data
# frame as `pattern` will error.  This exploratory line needs a named
# replacement vector built from probs$name / probs$i.
coded_orgs <- lapply(cc_orgs, str_replace_all, pattern = probs)

# NOTE(review): this loop overwrites `orgs` on every iteration, so only the
# organisations of year 2020 survive -- presumably a loop body was meant to
# go here.
for (i in 1785:2020)
{
  orgs <- cc_orgs[[i]]
}

# Inspection probe for topic X1.
subset(df_orgs, X == "X1")$name

# Alternative, x-prefixed manual coding of the 120 topics (a second coding
# round; not all of these vectors are used downstream).

# Gibberish:
x99 <- c("X1", "X8", "X13", "X19", "X20", "X23", "X24", "X27", "X28", "X34", "X43", "X50", "X56", "X59", "X62", "X67", "X69", "X72", "X76", "X83", "X89", "X94", "X96", "X97", "X99", "X103", "X104", "X111", "X112", "X118")

# Citizens:
x90 <- c("X10", "X12", "X15", "X16", "X17", "X25", "X35", "X37", "X38", "X48", "X53", "X81", "X82", "X88", "X115")

# Municipalities:
x19 <- c("X2", "X4", "X5", "X7", "X11", "X12", "X15", "X21", "X52", "X75", "X80", "X84", "X106", "X114")
# Executive, Ministry:
x11 <- c("X12", "X37", "X38", "X53", "X55", "X66", "X81", "X92", "X100", "X105", "X117", "X88")
# Executive, Treasury:
x12 <- c("X3", "X120")
# Legislative:
x14 <- c("X14", "X32", "X86", "X90")
# Monarchy:
x13 <- c("X87")
# Movements, Parties:
x15 <- c("X9", "X40", "X47", "X119")
# Foreign:
x21 <- c("X22", "X33", "X44", "X47", "X54", "X68", "X70", "X73", "X78", "X91", "X110", "X15")
# Supranational:
x22 <- c("X61", "X66")
# Intergovernmental:
x23 <- c("X15", "X95", "X120")
# CentralBanks:
x16 <- c("X45", "X79")
# Empire:
x17 <- c("X93")

# Extraction, agriculture:
x31 <- c("X39", "X58", "X116")
# Service:
x32 <- c("X23", "X36", "X57", "X64")
# Manufacturing:
x33 <- c("X49", "X101", "X116")
# Financial:
x34 <- c("X41", "X65")
# IT:
x35 <- c("X36")
# Unions:
x36 <- c("X46")
# Chambers:
x37 <- c("X71")
# Business, stocks:
x38 <- c("X102", "X108")

# Science and Tech:
x39 <- c("X50")

# Military:
x41 <- c("X12", "X29", "X42", "X63", "X75", "X77", "X109", "X112", "X113")
# Terrorism, counter-terrorism:
x42 <- c("X18", "X47")
# Transportation:
x43 <- c("X30", "X102")
# Religion:
x44 <- c("X26", "X47", "X98")

# Media:
x51 <- c("X11", "X74", "X85", "X107", "X35")
# Sports, Soccer:
x52 <- c("X6")
# Theatre, Cinema:
x53 <- c("X31", "X57", "X60")



# K = 50 model on the per-snippet corpus (one document per surrounding-word
# snippet), spectral init, 50 EM iterations.
stm_suwo_y <- stm(
  documents = out_suwo_y$documents,
  vocab = out_suwo_y$vocab,
  K = 50,
  prevalence = ~1,
  max.em.its = 50,
  data = out_suwo_y$meta,
  init.type = "Spectral"
)


# NOTE(review): this call immediately overwrites the fit above and passes the
# whole converted list as stm()'s first argument (`documents`); stm() takes
# documents and vocab separately, so this line will fail -- likely a leftover.
stm_suwo_y <- stm(out_suwo_y)

#  stm(documents = out$documents, vocab = out$vocab,
#    K = 20, prevalence =~ rating + s(day),
#    max.em.its = 75, data = out$meta,
#    init.type = "Spectral")

# Probe a single query first.
get_knowledge(x = search[3])

# Strip characters that would break the Wikipedia search URLs.
search_i <- str_remove_all(search, "[^A-Za-z0-9\\_\\=\\+]")

# Fetch knowledge for every sanitized query.
# FIX: the original assigned every result to df_srch[[1]], overwriting it on
# each iteration so only the last query survived; index with i instead, and
# pre-allocate the list.
df_srch <- vector("list", length(search_i))

for (i in seq_along(search_i)) {
  df_srch[[i]] <- get_knowledge(search_i[i])
  print(i)
  flush.console()
}

# Same fetch as the loop above, run via pblapply for a progress bar
# (result is not assigned -- exploratory).
pblapply(search, FUN = get_knowledge)

# rvest probes against the first search URL (exploratory, not used below).
s1 <- session(search[[1]])

ts <- rvest::read_html(search[[1]])
# html_elements(ts)

# session_follow_link(s1, i=1)
# session_follow_link(s1, css="div.mw-search-result-heading")

# , xpath="div.mw-search-result-heading")


# search[[1]] %>% html %>% html_nodes("table")

# Sample up to `size` organisations per year.
# FIX: `FUN = try(sampler)` evaluated `sampler` once and passed the bare
# function to lapply, so no per-call error handling ever happened; wrap each
# call in try() so a single bad year cannot abort the whole run.  (A year
# that fails yields a length-1 try-error element, as the original intended.)
org_sample <- lapply(cc_orgs[1785:2020], function(x) try(sampler(x, size = 10)))

c_org_sample <- unlist(org_sample)

# Sampled organisations per year; used to expand the year column below.
yearly_sample_size <- unlist(lapply(org_sample, length))

df_org_sample <- data.frame(org = names(c_org_sample), freq = c_org_sample, year = rep(1785:2020, times = yearly_sample_size))

# Hand-coded actor-category codes for the sampled organisations, one value
# per year 1785-2020 (0 = none / not codable).  The inline comments mark the
# year each row of codes runs up to.
organization_composition_data <- data.frame(
  year = c(1785:2020),
  category = c(
    rep(0, times = 25), 230, 0, 111, 230, 230, 230, 0, 230, 150, 230, 230, 0, 230, 230, 130, 230, 132, 150, 9000, 1410, 1230, 230, 230, 230, 230, 230, 230, 230, 230, 230, 230, 230, 132, 230, 230, 230, 150, 140, 1530, 700, 111, # 1850
    131, 430, 132, 1530, 1427, 230, 132, 132, 1410, 132, 230, 1230, 131, 1410, 1427, 1530, 1530, 131, 1530, 150, 230, 1530, 133, 131, 330, 1427, 230, 230, 1700, 131, 1410, 1530, 230, 115, 800, 230, 230, 130, 1530, 1422, 132, 230, 131, 1411, 1530, 230, 430, 1530, 230, # 1900
    1700, 230, 1530, 9000, 1417, 1530, 1427, 1700, 132, 1230, 1230, 1530, 133, 1427, 133, 132, 1700, 132, 1230, 132, 133, 1700, 1610, 1530, 1230, 800, 132, 111, 1300, 1530, 1530, 230, 1411, 700, 1530, 132, 1415, 133, 230, 430, 230, 230, 1417, 130, 1415, 1530, 1700, 1412, 110, 130, 1427, # 1950
    1700, 132, 132, 230, 110, 132, 132, 1230, 133, 110, 1530, 1410, 1300, 132, 1413, 1530, 510, 1530, 1415, 230, 1100, 1300, 230, 1412, 510, 132, 1530, 110, 140, 111, 1427, 1427, 810, 230, 510, 800, 810, 1100, 1610, 720, 400, 330, 1415, 150, 1410, 1415, 510, 132, 112, 1427, # 2000
    1400, 320, 111, 130, 820, 1300, 510, 1416, 1427, 1417, 1427, 150, 120, 1415, 600, 510, 820, 1427, 1000, 820
  )
)

# One unit rectangle per (year, category): quick visual check of coverage.
ggplot(organization_composition_data, aes(xmin = year, xmax = year + 1, ymin = category, ymax = category + 1)) +
  geom_rect()

# Wide table: one column per category code, cells hold the code value.
# wdata2 keeps the raw values for the period summaries further down;
# wdata is converted to decaying streaks in place below.
organization_composition_wdata2 <- pivot_wider(organization_composition_data, id_cols = year, names_from = category, values_from = category)


organization_composition_wdata <- pivot_wider(organization_composition_data, id_cols = year, names_from = category, values_from = category)

# Recode to presence/absence: NA -> 0, any positive code -> 1.
organization_composition_wdata[, 2:43] <- replace(organization_composition_wdata[, 2:43], is.na(organization_composition_wdata[, 2:43]), 0)

organization_composition_wdata[, 2:43] <- replace(organization_composition_wdata[, 2:43], (organization_composition_wdata[, 2:43]) > 0, 1)

# Turn presence into a decaying streak: a category seen this year adds 1 to
# its running value; an unseen one decays by 0.25 per year until ~0.
# The row bound is generalized from the hard-coded 236 to the actual table
# size; the 2:43 column range matches the recode steps above.
for (i in 2:nrow(organization_composition_wdata)) {
  for (j in 2:43) {
    organization_composition_wdata[i, j] <- ifelse(organization_composition_wdata[i, j] == 1, organization_composition_wdata[i - 1, j] + 1,
      ifelse(organization_composition_wdata[i - 1, j] > 0.09, organization_composition_wdata[i - 1, j] - 0.25, 0)
    )
  }
}

organization_composition_wdata$Gini <- NA

# Yearly Gini across the 42 category-streak columns (row bound generalized
# from the hard-coded 236 to the actual table size).
# NOTE(review): Gini() receives a one-row data-frame slice; confirm the Gini
# implementation in use coerces it to a numeric vector (an explicit unlist()
# would be safer).
for (i in seq_len(nrow(organization_composition_wdata)))
{
  organization_composition_wdata[i, "Gini"] <- Gini(organization_composition_wdata[i, 2:43])
}

# Diversity of actors over time, measured as 1 - Gini of the streak values.
gg_org_comp <- ggplot(organization_composition_wdata, aes(x = year, y = 1 - Gini)) +
  geom_point() +
  geom_smooth(span = .25) +
  theme_soft() +
  xlab("Year") +
  ylab("Diversity of actors (1-Gini)") +
  scale_x_continuous(breaks = seq(1780, 2020, 20))

# ggsave(gg_org_comp,file="org_comp.png",unit="cm",width=16,height=8,dpi=1200,scale=1.25)

# Actor diversity (1 - Gini) per ~50-year period, from the raw wide table.
1 - Gini(colSums(organization_composition_wdata2[organization_composition_wdata2$year < 1850, 2:43], na.rm = TRUE))
1 - Gini(colSums(organization_composition_wdata2[organization_composition_wdata2$year < 1900 & organization_composition_wdata2$year > 1849, 2:43], na.rm = TRUE))
1 - Gini(colSums(organization_composition_wdata2[organization_composition_wdata2$year < 1950 & organization_composition_wdata2$year > 1899, 2:43], na.rm = TRUE))
1 - Gini(colSums(organization_composition_wdata2[organization_composition_wdata2$year < 2000 & organization_composition_wdata2$year > 1949, 2:43], na.rm = TRUE))
# NOTE(review): unlike the four lines above, the next line reports Gini (not
# 1 - Gini) and covers 1970-2020 rather than 2000-2020 -- confirm whether
# `1 - Gini(...)` over year > 1999 was intended.
Gini(colSums(organization_composition_wdata2[organization_composition_wdata2$year < 2021 & organization_composition_wdata2$year > 1969, 2:43], na.rm = TRUE))

4.7 Automated Entity Classification—Wikipedia

Code
##################
###

# Build Wikipedia full-text search URLs for each unique entity name.
strt <- c("https://en.wikipedia.org/w/index.php?search=")
nd <- c("&title=Special%3ASearch&fulltext=1&ns0=1")

# First 100 organisation names per year, flattened and de-duplicated.
entities <- lapply(cc_orgs, first100names)

entvec <- unique(do.call(c, entities))

entvec <- entvec[!is.na(entvec)]

# Underscores back to spaces, to quanteda phrase objects, then re-joined
# with underscores for the URL query string.
rentvec <- (str_replace_all(entvec, "_", " "))

phrasevec <- phrase(rentvec)

phrasevec2 <- lapply(phrasevec, paste0, collapse = "_")

search <- paste0(strt, phrasevec2, nd)

# Spot check: keyword-in-context for the 4th entry of orgs_o100.
kwic_o1 <- (kwic(x = atok, pattern = names(orgs_o100)[4], window = 10, valuetype = "fixed", case_insensitive = TRUE))

# Surrounding words for the top-100 organisations.
sur100 <- pblapply(org100, surround_words, texts = atok)


# Leftover from a former function body (note the orphaned return/brace below).
dfm_wb <- dfm(tokens(wordbag))
dfm_wb %>% dfm_trim(min_termfreq = 5) -> dfm_wbm
#   return(dfm_wbm)
#   }




# Fetch knowledge for every search URL.
# Pre-allocate one slot per URL; the loop bound is generalized from the
# hard-coded 3359 to the actual number of queries.
li_srch <- vector("list", length(search))

for (i in seq_along(search)) {
  li_srch[[i]] <- get_knowledge(search[i])
  print(i)
  flush.console()
}


# One row per fetched result.
df_srch <- data.frame(do.call(rbind, li_srch))

# Tokenize the fetched Wikipedia content and carry over the metadata columns.
tk_srch <- tokens(df_srch$content, remove_punct = TRUE)
docvars(tk_srch) <- df_srch[, c(1, 2, 4, 5, 6, 7, 8)]

# Document-feature matrix: drop English stopwords, keep terms occurring in
# at least 1% of documents.
qdfm_srch <- dfm(tk_srch) %>%
  dfm_remove(stopwords("english")) %>%
  dfm_trim(min_docfreq = 0.01, docfreq_type = "prop")

sdfm_srch <- convert(qdfm_srch, to = "stm")

# Model selection: diagnostics for K = 10..100.
stm50_check <- searchK(documents = sdfm_srch$documents, vocab = sdfm_srch$vocab, K = seq(10, 100, 10), emtol = 0.01)


# Fit the K = 50 model.  FIX: the original repeated this identical call five
# times, each run overwriting the previous fit; one call suffices (set a seed
# before the call if the repetition was meant to compare random restarts).
stm50_fit <- stm(sdfm_srch, K = 50, emtol = 0.01)

# Exploratory rvest probes against the first search URL (not used below).
s1 <- session(search[[1]])

ts <- rvest::read_html(search[[1]])
html_elements(ts)

session_follow_link(s1, i = 1)
session_follow_link(s1, css = "div.mw-search-result-heading")

# , xpath="div.mw-search-result-heading")


# NOTE(review): rvest's html() is deprecated -- prefer read_html(search[[1]])
# as done above.
search[[1]] %>%
  html() %>%
  html_nodes("table")

5 Real-world Data

5.1 UK Trade volume

Code
# Exports from GB to other countries (rows where GB is the origin).
(subset(wwmetrics::trade, origin == "GBR"))$exports

# Imports to GB from other countries (rows where GB is the destination:
# the `exports` column of these rows is what other countries ship to GB).
(subset(wwmetrics::trade, destination == "GBR"))$exports # imports to GB

trade_to_uk <- (subset(wwmetrics::trade, destination == "GBR")) # GB imports
trade_from_uk <- (subset(wwmetrics::trade, origin == "GBR")) # GB exports

5.2 UK Budget Statistics

Code
#########################
#########################
#########################
#########################
#### Government spending

# UK government spending series (semicolon-separated CSV, read.csv2).
spend <- read.csv2(".//GovernmentSpendingUK.csv")
names(spend)[1] <- c("Year")
spend$Pop <- as.numeric(spend$Pop)
# Strip thousand separators ("." and ",") before converting nominal GDP.
spend$nGDP <- as.numeric(gsub("\\.|\\,", "", spend$Nominal.GDP))
# NOTE(review): rows 75:230 rescaled by /100 -- presumably a unit change in
# the source series; confirm against the CSV.
spend$nGDP[75:230] <- spend$nGDP[75:230] / 100
spend$rGDP <- as.numeric(gsub("\\,", "", spend$Real.GDP))
spend$Spending <- as.numeric(spend$Spending)
# spend$SUMME <- rowSums(spend[,84:101])
# NOTE(review): Spending2 relies on a SUMME column already present in the
# CSV -- the rowSums line that would build it is commented out.
spend$Spending2 <- as.numeric(spend$SUMME)

# Total spending per domain.
# NOTE(review): the numeric column indices below are tied to the CSV layout;
# any change to the file silently shifts what each domain sums.  Single-column
# domains (Agriculture, EconomicAid, ...) keep NA, whereas rowSums(na.rm = TRUE)
# domains turn all-NA rows into 0 -- intentional asymmetry to confirm.
spend$Economy <- rowSums(sapply(spend[, c(55, 60, 61, 62)], as.numeric), na.rm = T)
spend$Agriculture <- (sapply(spend[, 56], as.numeric))
spend$Resources <- rowSums(sapply(spend[, c(57, 58)], as.numeric), na.rm = T)
spend$Pensions <- rowSums(sapply(spend[, 4:7], as.numeric), na.rm = T)
spend$Health <- rowSums(sapply(spend[, 8:11], as.numeric), na.rm = T)
spend$Education <- rowSums(sapply(spend[, c(12:15, 54)], as.numeric), na.rm = T)
spend$Military <- rowSums(sapply(spend[, c(16:18, 20, 21)], as.numeric), na.rm = T)
spend$EconomicAid <- (sapply(spend[, 19], as.numeric))
spend$Family <- (sapply(spend[, 22], as.numeric))
spend$Unemployment <- (sapply(spend[, 23], as.numeric))
spend$Transport <- (sapply(spend[, 34], as.numeric))
spend$Religion <- (sapply(spend[, 53], as.numeric))
spend$Housing <- rowSums(sapply(spend[, c(24, 44, 45, 46, 47, 48, 49)], as.numeric), na.rm = T)
spend$Recreation <- rowSums(sapply(spend[, 50:52], as.numeric), na.rm = T)
spend$Environment <- rowSums(sapply(spend[, c(39, 40, 41, 42, 43)], as.numeric), na.rm = T)
spend$Government <- rowSums(sapply(spend[, c(35:38, 59)], as.numeric), na.rm = T)
spend$Welfare <- rowSums(sapply(spend[, c(25, 26, 27)], as.numeric), na.rm = T)
spend$Security <- rowSums(sapply(spend[, c(28, 29, 30, 31, 32, 33)], as.numeric), na.rm = T)

# Domain's share of total spending
# Per-domain shares of total spending (Spending2) and of nominal GDP (nGDP).
# The domain vector is ordered exactly as the original column-by-column
# assignments, and all *_Spending columns are created before any *_nGDP
# column, so the positional column indices used downstream
# (e.g. spend[, 104:119], spend[, 121:137]) still address the same columns.
spending_domains <- c(
  "Economy", "Agriculture", "Resources", "Pensions", "Health", "Education",
  "Military", "Family", "Unemployment", "Transport", "Religion", "Housing",
  "Recreation", "Environment", "Government", "Welfare", "Security",
  "EconomicAid"
)

# Domain's share of total spending
for (d in spending_domains) {
  spend[[paste0(d, "_Spending")]] <- spend[[d]] / spend$Spending2
}

# Domain spending: share of GDP
for (d in spending_domains) {
  spend[[paste0(d, "_nGDP")]] <- spend[[d]] / spend$nGDP
}

# Long-format spending tables (reshape2::melt), selected by column position.
# NOTE(review): 104:119 is 16 columns but 18 *_Spending columns were created
# above, and 121:137 is 17 of the 18 *_nGDP columns -- verify these ranges
# against the actual column layout of `spend`.
lspend <- melt(spend[, c(1, 104:119)], id.vars = c("Year"))
lspend.GDP <- melt(spend[, c(1, 121:137)], id.vars = c("Year"))
lspend.real <- melt(spend[, c(1, 71, 84:101)], id.vars = c("Year", "GDP.Deflator"))
# Deflate nominal values and add each year's total real spending
# (times = 18 matches the 18 melted spending categories).
lspend.real$real <- lspend.real$value / as.numeric(lspend.real$GDP.Deflator)
lspend.real$total <- rep(aggregate(lspend.real$real, by = list(lspend.real$Year), FUN = "sum")$x, times = 18)

# NOTE(review): deflators are conventionally nGDP/rGDP (x100); this computes
# rGDP/nGDP -- confirm the intended orientation.
gdpdeflator <- data.frame(year = spend$Year, deflator = spend$rGDP / spend$nGDP)

lspend.real$SpendingCategory <- lspend.real$variable
# Category's percentage of the year's total real spending.
lspend.real$prop <- 100 * lspend.real$real / lspend.real$total
lspend.real$gini <- NA

# Yearly Gini of the spending distribution across categories (same value
# stamped on every row of that year).
for (i in min(lspend.real$Year):max(lspend.real$Year))
{
  lsp <- subset(lspend.real, Year == i)
  lsp$gini <- Gini(lsp$prop)
  lspend.real[lspend.real$Year == i, "gini"] <- lsp$gini[1]
}

lspend.GDP.total <- aggregate(lspend.GDP$value, by = list(lspend.GDP$Year), FUN = "sum")

# Combine with yearly content data.

load(".//thetimes.RData")

# NOTE(review): duplicate of the aggregate() call above.
lspend.GDP.total <- aggregate(lspend.GDP$value, by = list(lspend.GDP$Year), FUN = "sum")

# Attach spending-to-GDP per year; pad years before the first observation
# (rows 1:16) and the final row (236) with the nearest observed value.
thetimes$spending_GDP <- lspend.GDP.total[match(thetimes$year, lspend.GDP.total$Group.1), "x"]
thetimes[1:16, "spending_GDP"] <- thetimes[17, "spending_GDP"]
thetimes[236, "spending_GDP"] <- thetimes[235, "spending_GDP"]

# One row per year suffices since gini is constant within a year.
lspend.gini <- subset(lspend.real, variable == "Economy")

thetimes[, "spending_gini"] <- lspend.gini[match(thetimes$year, lspend.gini$Year), "gini"]
thetimes[1:16, "spending_gini"] <- thetimes[17, "spending_gini"]
thetimes[236, "spending_gini"] <- thetimes[235, "spending_gini"]
# Reversed Gini = spending diversity.
thetimes$gini_rev <- 1 - thetimes$spending_gini

6 Analysis

6.1 Salience of crisis coverage

6.1.1 Main analyses

Code
#### Load data
#### Load data
load("to20_year.RData")
load("to50_year.RData")
load("to250_year.RData")
load("wd250x.RData")
load("to_lab.RData")
load("spend.RData")
load("thetimes.RData")

# Topic-model year index starts at 1; shift to calendar years.
to250_year$dyear <- to250_year$year + 1784

#### Create a data frame of the number of CRISIS NEWS WAVES

fifty <- c(1800, 1850, 1900, 1950, 2000)
y25 <- c(1785, 1800, 1825, 1850, 1875, 1900, 1925, 1950, 1975, 2000, 2020, 2025, 2050)

# Crisis news waves (CNW) per year; years with no wave are absent from the
# table and set to 0 below.
cnw.count <- (table(wd250x$year))

df.cnw.count <- data.frame(year = 1785:2020, cnw.count = 0)
df.cnw.count$cnw.count <- cnw.count[match(df.cnw.count$year, as.numeric(names(cnw.count)))]
df.cnw.count$cnw <- ifelse(is.na(df.cnw.count$cnw.count), 0, df.cnw.count$cnw.count)


###################################
###################################
###################################
###################################
#### Test hypothesis 1.3: CNW count has increased 1785-2020

# Yearly CNW counts with loess and linear trends; year labels sized by count.
count_cnw_year <- ggplot(df.cnw.count, aes(x = year, y = cnw)) +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Count of crisis news waves") +
  geom_text(aes(label = year, size = cnw)) +
  scale_y_log10(breaks = c(5, 10, 20, 30, 40, 50))

ggsave(count_cnw_year, file = "soft_count_cnw_year.svg", unit = "cm", width = 20, height = 10, dpi = 1200, scale = 1.25)

# Linear trend in CNW count, intercept anchored at 1785.
lm.cnw_count.year <- (lm(cnw ~ I(year - 1785), data = df.cnw.count))
summary(lm.cnw_count.year)
confint(lm.cnw_count.year)
pred.cnw_count.year <- predict(lm.cnw_count.year, newdata = data.frame(year = y25))
names(pred.cnw_count.year) <- y25
### B=.026 (.006); t=4.092; p<.001

# Merge CNW / crisis-language counts into the yearly table.
# NOTE(review): `cnw.share` and `cl.share` are not defined in this section
# (despite the names they appear to hold counts, given the division by
# `articles` below) -- confirm where they are created.
thetimes$cnw.count <- cnw.share[match(thetimes$year, names(cnw.share))]
thetimes$cl.count <- cl.share[match(thetimes$year, cl.share$dyear), "total.count"]
thetimes$cnw.count <- replace(thetimes$cnw.count, is.na(thetimes$cnw.count), 0)
thetimes$cl.count <- replace(thetimes$cl.count, is.na(thetimes$cl.count), 0)
thetimes$cnw.share <- thetimes$cnw.count / thetimes$articles
thetimes$cl.share <- thetimes$cl.count / thetimes$articles
thetimes$cnw_to_cl.share <- thetimes$cnw.count / thetimes$cl.count
thetimes$cnw <- df.cnw.count$cnw

#### %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ####
#### %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ####
#### %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ####
#### %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ####
#### %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ####
#### %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ####




##################################################
#### Create a data frame of the number of CRISIS NEWS WAVES
####

count_cnw_year <- ggplot(df.cnw.count, aes(x = year, y = cnw)) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Count of crisis news waves")

count_cnw_year2 <- ggplot(df.cnw.count, aes(x = year, y = cnw, label = year)) +
  geom_point(aes(size = cnw), alpha = .2, shape = 17) +
  geom_text(aes(size = cnw)) +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Count of crisis news waves") +
  scale_y_sqrt() +
  theme(legend.position = "none")


share_cnw_year <- ggplot(thetimes, aes(x = year, y = 100 * cnw.share)) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Share of CNW coverage") +
  scale_y_sqrt()

share_cl_year <- ggplot(thetimes, aes(x = year, y = 100 * cl.share)) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Share of CL coverage") +
  scale_y_sqrt()

# Per-wave properties over time (data: wd250x, one row per crisis news wave).
# Intensity (articles per day); waves with volume >= 100 are excluded as
# outliers. Loess (teal) and linear (red) trends, sqrt y-axis.
intensity_cnw <- ggplot(subset(wd250x, volume < 100), aes(y = intensity2, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red", formula = y ~ x) +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Intensity of crisis news waves (articles per day)") +
  xlab("Year") +
  scale_y_sqrt()


# Volume of each wave (same outlier filter and trend lines).
volume_cnw <- ggplot(subset(wd250x, volume < 100), aes(y = volume2, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red", formula = y ~ x) +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Volume of crisis news waves") +
  xlab("Year") +
  scale_y_sqrt()

# Duration of each wave; here no volume filter is applied.
duration_cnw <- ggplot((wd250x), aes(y = duration, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Duration of crisis news waves") +
  xlab("Year") +
  scale_y_sqrt()

# Export the main trend figures as SVG.
# Fix: ggsave()'s argument is `units` (plural). The original `unit = "cm"`
# was silently absorbed by `...`, so the 16x8 dimensions were interpreted in
# the default inches; `units = "cm"` matches the intended size (and the
# correct spelling used later in this script).
ggsave(count_cnw_year2, filename = "count_cnw_year.svg", dpi = 1200, width = 16, height = 8, units = "cm", scale = 1.5)
ggsave(share_cl_year, filename = "share_cl_year.svg", dpi = 1200, width = 16, height = 8, units = "cm", scale = 1.5)
ggsave(share_cnw_year, filename = "share_cnw_year.svg", dpi = 1200, width = 16, height = 8, units = "cm", scale = 1.5)
ggsave(volume_cnw, filename = "volume_year.svg", dpi = 1200, width = 16, height = 8, units = "cm", scale = 1.5)
ggsave(duration_cnw, filename = "duration_cnw.svg", dpi = 1200, width = 16, height = 8, units = "cm", scale = 1.5)
ggsave(intensity_cnw, filename = "intensity_cnw.svg", dpi = 1200, width = 16, height = 8, units = "cm", scale = 1.5)

# Topic-model selection diagnostics (exclusivity, semantic coherence,
# held-out likelihood, residuals) as a function of the number of topics K;
# the red vertical line marks the chosen K = 250.
search_for_K <- ggplot(subset(sk10t, variable %in% c("exclus", "semcoh", "heldout", "residual")), aes(x = K, y = value)) +
  geom_point() +
  geom_line() +
  geom_smooth(color = "darkslategray4") +
  geom_vline(xintercept = 250, color = "red") +
  facet_wrap(. ~ variable, scales = "free_y") +
  theme_bluewhite() +
  ylab("Value")

ggsave(search_for_K, file = "search_for_K.svg", dpi = 1200, scale = 1.5, unit = "cm", width = 16, height = 10)

# Linear time trends for the six outcome series. Time is recentred so the
# slope reads as "change per year since 1785"; the `+ 0` models additionally
# drop the intercept, forcing the fitted line through zero at 1785.
lm.cl_share.year <- (lm(100 * cl.share ~ I(year - 1785) + 0, data = thetimes))
lm.cnw_share.year <- (lm(100 * cnw.share ~ I(year - 1785) + 0, data = thetimes))
lm.cnw_count.year <- (lm(cnw ~ I(year - 1785), data = thetimes))
lm.cnw_volume.year <- (lm((volume + duration * baseline30) ~ I(as.numeric(year) - 1785) + 0, data = wd250x))
lm.cnw_duration.year <- (lm(duration ~ I(as.numeric(year) - 1785), data = wd250x))
lm.cnw_intensity.year <- (lm(I(intensity + baseline30) ~ I(as.numeric(year) - 1785) + 0, data = wd250x))

# 95% confidence intervals for the trend coefficients.
ci.cl_share.year <- confint(lm.cl_share.year)
ci.cnw_share.year <- confint(lm.cnw_share.year)
ci.cnw_count.year <- confint(lm.cnw_count.year)
ci.cnw_volume.year <- confint(lm.cnw_volume.year)
ci.cnw_duration.year <- confint(lm.cnw_duration.year)
ci.cnw_intensity.year <- confint(lm.cnw_intensity.year)

# Fitted values at the years held in `y25` (defined elsewhere in the script;
# presumably a coarse year grid -- TODO confirm).
pred.cl_share.year <- predict(lm.cl_share.year, newdata = data.frame(year = y25))
pred.cnw_share.year <- predict(lm.cnw_share.year, newdata = data.frame(year = y25))
pred.cnw_count.year <- predict(lm.cnw_count.year, newdata = data.frame(year = y25))
pred.cnw_volume.year <- predict(lm.cnw_volume.year, newdata = data.frame(year = y25))
pred.cnw_duration.year <- predict(lm.cnw_duration.year, newdata = data.frame(year = y25))
pred.cnw_intensity.year <- predict(lm.cnw_intensity.year, newdata = data.frame(year = y25))

# Label each prediction with its year for readable printing.
names(pred.cl_share.year) <- y25
names(pred.cnw_share.year) <- y25
names(pred.cnw_count.year) <- y25
names(pred.cnw_volume.year) <- y25
names(pred.cnw_duration.year) <- y25
names(pred.cnw_intensity.year) <- y25

# Gantt-style spectrum of crisis news waves: one rectangle per wave on its
# topic's row; the x-extent spans the (visually stretched) wave duration and
# the rectangle height scales with sqrt of wave volume.
# Fix: the original chain added viridis colour/fill scales that were
# immediately replaced (with a "Scale ... is already present" message) by the
# manual scales below; the redundant viridis calls are removed.
topic_spectrum_year <- ggplot(subset(wd250x, !is.na(topic20)), aes(x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), color = topic20_ord, fill = topic20_ord, ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 195)^(1 / 2) - 1)) +
  geom_rect() +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  scale_y_continuous(breaks = seq(0, 20, 5), minor_breaks = seq(0, 20, 1)) +
  scale_color_manual("Topic", values = rep(c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"), times = 2)) +
  scale_fill_manual("Topic", values = rep(c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"), times = 2)) +
  ylab("Topics") +
  xlab("Year") +
  theme(axis.text.y = element_blank())

# Overlay a smoothed yearly CNW count (halved to fit the topic-row y-scale).
topic_spectrum_year + geom_smooth(data = thetimes, aes(x = year, y = cnw / 2, xmin = NULL, xmax = NULL, ymin = NULL, ymax = NULL), color = "black", fill = "black")


# Same overlay with the count rescaled to a 0-100 range.
topic_spectrum_year + geom_smooth(data = thetimes, aes(x = year, y = 100 * cnw.count / max(cnw.count), xmin = NULL, xmax = NULL, ymin = NULL, ymax = NULL), color = "black", fill = "black")

# Fix: `units` (plural); `unit = "cm"` was absorbed by `...`, leaving inches.
ggsave(topic_spectrum_year, filename = "topic_spectrum_year.svg", dpi = 1200, units = "cm", width = 16, height = 8, scale = 1.75)


# Interactive (plotly) spectrum of epidemic (EPI) waves with event ids and
# labels printed next to each rectangle.
# Fix: the chain set scale_x_continuous() twice with identical arguments
# (once near the top and once at the end), triggering a replacement message;
# the duplicate trailing call is removed.
EPI_topic_spectrum_year <- ggplot(subset(wd250x, topic20 == "EPI"), aes(color = topic20, fill = topic20, x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 50000)^(1 / 2) - 1)) +
  geom_rect(color = "black", alpha = .3) +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(label = event_id, y = topic20_num + coord - 1), hjust = 1) +
  geom_text(aes(label = event_label, y = topic20_num + coord - 1), hjust = 0) +
  scale_y_continuous(breaks = seq(0, 20, 5), minor_breaks = seq(0, 20, 1)) +
  ylab("Topics") +
  xlab("Year") +
  theme(axis.text.y = element_blank()) +
  scale_color_manual(values = subset(wd250x, topic20 == "EPI")$color20) +
  scale_fill_manual(values = subset(wd250x, topic20 == "EPI")$color20)

# Render interactively (requires the plotly package).
ggplotly(EPI_topic_spectrum_year)

# Interactive spectrum of political (POL) waves; same layout as the EPI plot.
# Fix: duplicate trailing scale_x_continuous() call removed (it merely
# replaced the identical scale set earlier in the chain).
POL_topic_spectrum_year <- ggplot(subset(wd250x, topic20 == "POL"), aes(color = topic20, fill = topic20, x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 50000)^(1 / 2) - 1)) +
  geom_rect(color = "black", alpha = .3) +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(label = event_id, y = topic20_num + coord - 1), hjust = 1) +
  geom_text(aes(label = event_label, y = topic20_num + coord - 1), hjust = 0) +
  scale_y_continuous(breaks = seq(0, 20, 5), minor_breaks = seq(0, 20, 1)) +
  ylab("Topics") +
  xlab("Year") +
  theme(axis.text.y = element_blank()) +
  scale_color_manual(values = subset(wd250x, topic20 == "POL")$color20) +
  scale_fill_manual(values = subset(wd250x, topic20 == "POL")$color20)

# Render interactively (requires the plotly package).
ggplotly(POL_topic_spectrum_year)

# Interactive spectrum of transport (TRA) waves; same layout as the EPI plot.
# Fix: duplicate trailing scale_x_continuous() call removed (it merely
# replaced the identical scale set earlier in the chain).
TRA_topic_spectrum_year <- ggplot(subset(wd250x, topic20 == "TRA"), aes(color = topic20, fill = topic20, x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 50000)^(1 / 2) - 1)) +
  geom_rect(color = "black", alpha = .3) +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(label = event_id, y = topic20_num + coord - 1), hjust = 1) +
  geom_text(aes(label = event_label, y = topic20_num + coord - 1), hjust = 0) +
  scale_y_continuous(breaks = seq(0, 20, 5), minor_breaks = seq(0, 20, 1)) +
  ylab("Topics") +
  xlab("Year") +
  theme(axis.text.y = element_blank()) +
  scale_color_manual(values = subset(wd250x, topic20 == "TRA")$color20) +
  scale_fill_manual(values = subset(wd250x, topic20 == "TRA")$color20)

# Render interactively (requires the plotly package).
ggplotly(TRA_topic_spectrum_year)


# Interactive spectrum of economic (ECO) waves. Unlike the sibling plots,
# the event id is pushed below its rectangle via a "\n\n" prefix and
# centred (hjust = 0.5).
ECO_topic_spectrum_year <- ggplot(subset(wd250x, topic20 == "ECO"), aes(color = topic20, fill = topic20, x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 50000)^(1 / 2) - 1)) +
  geom_rect(color = "black", alpha = .3) +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(label = paste0("\n\n", event_id), y = topic20_num + coord - 1), hjust = 0.5) +
  geom_text(aes(label = event_label, y = topic20_num + coord - 1), hjust = 0) +
  scale_y_continuous(breaks = seq(0, 20, 5), minor_breaks = seq(0, 20, 1)) +
  ylab("Topics") +
  xlab("Year") +
  theme(axis.text.y = element_blank()) +
  scale_color_manual(values = subset(wd250x, topic20 == "ECO")$color20) +
  scale_fill_manual(values = subset(wd250x, topic20 == "ECO")$color20)

# Render interactively (requires the plotly package).
ggplotly(ECO_topic_spectrum_year)






# Simplified variants: same spectra but with only the event id printed
# (no event label). NOTE(review): these assignments overwrite the
# EPI_/ECO_topic_spectrum_year objects defined above.
EPI_topic_spectrum_year <- ggplot(subset(wd250x, topic20 == "EPI"), aes(color = topic20, fill = topic20, x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 50000)^(1 / 2) - 1)) +
  geom_rect(color = "black", alpha = .3) +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(label = event_id, y = topic20_num + coord - 1)) +
  scale_y_continuous(breaks = seq(0, 20, 5), minor_breaks = seq(0, 20, 1)) +
  ylab("Topics") +
  xlab("Year") +
  theme(axis.text.y = element_blank()) +
  scale_color_manual(values = subset(wd250x, topic20 == "EPI")$color20) +
  scale_fill_manual(values = subset(wd250x, topic20 == "EPI")$color20)

ECO_topic_spectrum_year <- ggplot(subset(wd250x, topic20 == "ECO"), aes(color = topic20, fill = topic20, x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 50000)^(1 / 2) - 1)) +
  geom_rect(color = "black", alpha = .3) +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(label = event_id, y = topic20_num + coord - 1)) +
  scale_y_continuous(breaks = seq(0, 20, 5), minor_breaks = seq(0, 20, 1)) +
  ylab("Topics") +
  xlab("Year") +
  theme(axis.text.y = element_blank()) +
  scale_color_manual(values = subset(wd250x, topic20 == "ECO")$color20) +
  scale_fill_manual(values = subset(wd250x, topic20 == "ECO")$color20)



# Static per-topic spectra (ECO/POL/DIS) coloured by ordered topic label.
# NOTE(review): ECO_ and POL_topic_spectrum_year overwrite the versions
# defined above.
# Fix: each chain added viridis colour/fill scales that were immediately
# replaced (with a "Scale ... is already present" message) by the manual
# scales that follow; the redundant viridis calls are removed.
ECO_topic_spectrum_year <- ggplot(subset(wd250x, topic20 == "ECO"), aes(x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), color = topic20_ord, fill = topic20_ord, ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 50000)^(1 / 2) - 1)) +
  geom_rect(color = "black", alpha = .3) +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  scale_y_continuous(breaks = seq(0, 20, 5), minor_breaks = seq(0, 20, 1)) +
  scale_color_manual("Topic", values = rep(c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"), times = 2)) +
  scale_fill_manual("Topic", values = rep(c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"), times = 2)) +
  ylab("Topics") +
  xlab("Year") +
  theme(axis.text.y = element_blank())

POL_topic_spectrum_year <- ggplot(subset(wd250x, topic20 == "POL"), aes(x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), color = topic20_ord, fill = topic20_ord, ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 50000)^(1 / 2) - 1)) +
  geom_rect(color = "black", alpha = .3) +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  scale_y_continuous(breaks = seq(0, 20, 5), minor_breaks = seq(0, 20, 1)) +
  scale_color_manual("Topic", values = rep(c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"), times = 2)) +
  scale_fill_manual("Topic", values = rep(c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"), times = 2)) +
  ylab("Topics") +
  xlab("Year") +
  theme(axis.text.y = element_blank())

DIS_topic_spectrum_year <- ggplot(subset(wd250x, topic20 == "DIS"), aes(x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), color = topic20_ord, fill = topic20_ord, ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 50000)^(1 / 2) - 1)) +
  geom_rect(color = "black", alpha = .3) +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  scale_y_continuous(breaks = seq(0, 20, 5), minor_breaks = seq(0, 20, 1)) +
  scale_color_manual("Topic", values = rep(c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"), times = 2)) +
  scale_fill_manual("Topic", values = rep(c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"), times = 2)) +
  ylab("Topics") +
  xlab("Year") +
  theme(axis.text.y = element_blank())







# Yearly counts of ECO vs non-ECO crisis news waves, aligned to a complete
# 1785-2020 year grid (236 years) so years with no waves appear as well.
eco.cnw.count <- data.frame(table(wd250x$year, wd250x$topic20 == "ECO"))
df.eco.cnw.count <- data.frame(year = c(1785:2020, 1785:2020), topic = rep(c("ECO", "non-ECO"), each = 236), cnw.count = 0)

# Rows 1:236 = ECO counts; match() leaves NA for years absent from the table.
df.eco.cnw.count$cnw.count[1:236] <- subset(eco.cnw.count, Var2 == TRUE)[match(subset(df.eco.cnw.count, topic == "ECO")$year, subset(eco.cnw.count, Var2 == TRUE)$Var1), "Freq"]

# Rows 237:472 = non-ECO counts.
df.eco.cnw.count$cnw.count[237:472] <- subset(eco.cnw.count, Var2 == FALSE)[match(subset(df.eco.cnw.count, topic == "non-ECO")$year, subset(eco.cnw.count, Var2 == FALSE)$Var1), "Freq"]

# Quick visual comparison of the two series.
ggplot(df.eco.cnw.count, aes(y = cnw.count, x = year, color = topic, fill = topic, shape = topic)) +
  geom_point() +
  geom_smooth()




##################################################
#### Volume of CRISIS NEWS WAVES
####

# Wave volume over time (outliers with volume >= 100 excluded from display).
ggplot(subset(wd250x, volume < 100), aes(y = volume, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red", formula = y ~ x) +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Volume of crisis news waves")

# Linear trend on the full (unfiltered) data; result recorded below.
summary(lm(volume ~ as.numeric(year), data = wd250x))
### B=.044 (.004); t=12.34; p<.001

# Same trend, faceted by topic.
ggplot(subset(wd250x, volume < 100), aes(y = volume, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  facet_wrap(. ~ topic20) +
  ylim(0, 10)

# ECO vs everything else, as two facets.
ggplot(subset(wd250x, volume < 100), aes(y = volume, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  facet_wrap(. ~ topic20 == "ECO") +
  ylim(0, 10)

# ECO vs non-ECO overlaid in one panel.
ggplot((wd250x), aes(y = volume, x = as.numeric(year), shape = (topic20 == "ECO"), color = (topic20 == "ECO"))) +
  geom_point() +
  geom_smooth() +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylim(0, 10)


##################################################
#### Duration of CRISIS NEWS WAVES
####

# Wave duration over time, with loess (teal) and linear (red) trends.
ggplot((wd250x), aes(y = duration, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Duration of crisis news waves")

# Linear trend; result recorded below.
summary(lm(duration ~ as.numeric(year), data = wd250x))
### B=.840 (.042); t=19.83; p<.001

# Duration trend faceted by topic.
ggplot((wd250x), aes(y = duration, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  facet_wrap(. ~ topic20) +
  ylim(0, 500)

# ECO vs everything else, as two facets.
ggplot((wd250x), aes(y = duration, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  facet_wrap(. ~ topic20 == "ECO") +
  ylim(0, 500)

# ECO vs non-ECO overlaid in one panel.
ggplot((wd250x), aes(y = duration, x = as.numeric(year), shape = (topic20 == "ECO"), color = (topic20 == "ECO"))) +
  geom_point() +
  geom_smooth() +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylim(0, 500)

##################################################
#### Intensity of CRISIS NEWS WAVES
####

# Wave intensity over time (display limited to intensity < 0.4).
ggplot(subset(wd250x, intensity < 0.4), aes(y = intensity, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20))

# Linear trend on the full data; result recorded below.
summary(lm(intensity ~ as.numeric(year), data = wd250x))
### B=.00014487 (.00001336); t=10.846; p<.001




## News volume

# Yearly article totals, extracted for plotting.
newsvolume <- data.frame(year = thetimes$year, articles = thetimes$articles)

# Editor tenures of The Times (start/end years; some one-year tenures have
# start == end, e.g. Freeman 1919).
editors <- data.frame(
  start = c(1785, 1803, 1812, 1817, 1841, 1877, 1884, 1912, 1919, 1919, 1923, 1941, 1948, 1952, 1967, 1981, 1982, 1985, 1990, 1992, 2002, 2007, 2013),
  end = c(1803, 1812, 1817, 1841, 1877, 1884, 1912, 1919, 1919, 1923, 1941, 1948, 1952, 1967, 1981, 1982, 1985, 1990, 1992, 2002, 2007, 2013, 2020),
  editor = c("Walter I", "Walter II", "Stoddard", "Barnes", "Delane", "Chenery", "Buckle", "Dawson", "Freeman", "Steed", "Dawson", "Barrington-Ward", "Casey", "Haley", "Rees-Mogg", "Evans", "Douglas-Home", "Wilson", "Jenkins", "Stothard", "Thomson", "Harding", "Witherow")
)

# Ownership periods of The Times.
owners <- data.frame(
  start = c(1785, 1803, 1847, 1894, 1908, 1922, 1959, 1966, 1976, 1981),
  end = c(1803, 1847, 1894, 1908, 1922, 1959, 1966, 1976, 1981, 2020),
  owner = c("Walter I", "Walter II", "Walter III", "Walter IV", "Harmsworth", "Astor I", "Astor II", "Thomson I", "Thomson II", "Murdoch")
)

# Landmark events (wars, scandals, printing/communication technology,
# industrial action) for annotating the coverage-volume plots.
events <- data.frame(
  start = c(1803, 1914, 1939, 1935, 1887, 1920, 1814, 1844, 1838, 1848, 1860, 1978),
  end = c(1815, 1918, 1945, 1938, 1888, 1921, 1815, 1845, 1839, 1870, 1866, 1979),
  event = c("Napoleonic Wars", "World War I", "World War II", "Appeasement", "Piggott forgeries", "Zion hoax", "Hi-speed steam press", "Rotary press", "London-Birmingham Postal Railway", "Telegraph network established", "'Walter Press'", "Strike")
)


# Cover price in pounds at selected years. Denominator 240 presumably
# converts pre-decimalisation pence, 100 converts new pence (from 1973 on)
# -- TODO confirm.
prices <- data.frame(
  year = c(1788, 1797, 1805, 1814, 1824, 1838, 1847, 1857, 1865, 1873, 1882, 1890, 1907, 1921, 1931, 1939, 1950, 1973, 1980, 1990, 2000, 2007, 2014, 2020),
  price = c(3 / 240, 6 / 240, 6 / 240, 6.5 / 240, 7 / 240, 5 / 240, 5 / 240, 4 / 240, 3 / 240, 3 / 240, 3 / 240, 3 / 240, 3 / 240, 5 / 240, 4 / 240, 2.5 / 240, 3 / 240, 3 / 100, 20 / 100, 35 / 100, 35 / 100, 65 / 100, 120 / 100, 220 / 100)
)

# Interpolated GDP deflator: pad the series with constant values at both
# ends (1.41 before the series starts, 106.00 after it ends).
# NOTE(review): the indices 11/12:230/2 are hard-coded to the row layout of
# `spend` (defined elsewhere) -- verify if that frame changes.
spend$GDP_deflator_i <- as.numeric(c(rep(1.41, times = 11), spend$GDP.Deflator[12:230], rep(106.00, times = 2)))

# Inflation-adjusted cover price (nominal price / deflator, x 100).
prices$realprice <- 100 * prices$price / spend[match(prices$year, spend$Year), "GDP_deflator_i"]

# Reported circulation of The Times at selected years.
circulation <- data.frame(
  year = c(1815, 1852, 1910, 1921, 1930, 1939, 1947, 1956, 1966, 1976, 1980, 1992, 2000, 2005, 2010, 2015, 2019),
  circulation = c(5000, 42384, 45000, 113000, 187000, 204000, 268000, 220716, 282000, 310000, 297000, 386258, 726349, 686327, 508250, 396621, 417298)
)

# Yearly topic probability mass split into interpretable vs uninterpretable
# topics (uninterpretable = `area` is NA in the long topic-model frame).
senseless.topics <- aggregate(long.STM$pr, by = list(long.STM$year, is.na(long.STM$area)), FUN = "sum")
# Yearly total across both groups, recycled over the two group blocks.
senseless.topics$total <- rep(tapply(senseless.topics[, "x"], senseless.topics$Group.1, "sum"), times = 2)
senseless.topics$share <- senseless.topics$x / senseless.topics$total

# Stacked-area plot of interpretable vs uninterpretable share per year.
gg.senseless.topics <- ggplot(senseless.topics, aes(y = 100 * share, x = as.numeric(Group.1), fill = Group.2)) +
  geom_area(position = "stack") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  scale_color_viridis_d() +
  scale_fill_viridis_d(begin = .2, end = .8, labels = c("Interpretable", "Uninterpretable")) +
  xlab("Year") +
  ylab("Share of non-interpretable issues") +
  guides(fill = guide_legend("Interpretability\nof topic")) +
  theme(legend.position = c(0.7, 0.7))

# Trajectory of the uninterpretable-topic share alone, with a loess trend.
ggplot(subset(senseless.topics, Group.2 == TRUE), aes(y = 100 * share, x = as.numeric(Group.1))) +
  geom_point() +
  geom_smooth() +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10))

# Fix: ggsave() takes `units` (plural); the original `unit = "cm"` fell into
# `...` and the 16x8 dimensions were interpreted in the default inches.
ggsave(gg.senseless.topics, filename = "senseless.svg", units = "cm", width = 16, height = 8, scale = 1.25)


# Detection threshold (minimum wave volume) per year; the light-blue vertical
# lines mark the years in `fifty` (defined elsewhere -- TODO confirm).
# NOTE(review): `threshold2` is not created in the visible newsvolume
# definition; it is presumably attached earlier in the script.
# Fixes: axis-label typo "Thresdhold" -> "Threshold"; ggsave `unit` -> `units`
# (the misspelt argument fell into `...`, leaving the default inches).
gg.threshold <- ggplot(newsvolume, aes(y = threshold2, x = year)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_point(color = "#7d3c98") +
  geom_line(color = "#7d3c98") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  scale_color_viridis_d() +
  scale_fill_viridis_d(begin = .2, end = .8) +
  xlab("Year") +
  ylab("Threshold of volume for\ndetection of crisis events") +
  guides(fill = guide_legend("Minimum wave volume"))


ggsave(gg.threshold, filename = "threshold.svg", units = "cm", width = 16, height = 8, scale = 1.25)



# Combined trajectory plot: yearly article counts, circulation (scaled /7),
# real price (scaled x50000), plus editor/owner/event bands. The hard-coded
# y-vectors in the geom_text calls stagger labels to avoid overlap and are
# tied to the exact row counts of editors/owners/events.
coverage.volume.trajectory <- ggplot() +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "lightblue", size = 1) +
  geom_point(data = newsvolume, aes(y = articles, x = year), color = "black") +
  geom_smooth(data = newsvolume, aes(y = articles, x = year), color = "dodgerblue") +
  geom_point(data = circulation, aes(y = circulation / 7, x = year), shape = 15, size = 5, color = "#888888") +
  geom_point(data = prices, aes(y = realprice * 50000, x = year), shape = 16, size = 5, color = "hotpink") +
  geom_text(data = circulation, aes(y = circulation / 7, x = year + 2, label = circulation), color = "#888888", hjust = 0) +
  geom_rect(data = editors, aes(ymin = 1000, ymax = 3000, xmin = start, xmax = end, fill = editor), color = "white") +
  geom_text(data = editors, aes(y = c(rep(c(-1000, -7000, -4000, -10000), times = 5), -7000, -10000, -4000), x = start, label = editor), hjust = 0) +
  geom_rect(data = owners, aes(ymin = 140000, ymax = 142000, xmin = start, xmax = end, fill = owner), color = "white") +
  geom_text(data = owners, aes(y = c(rep(c(138000, 132000, 135000, 129000), times = 2), 138000, 132000), x = start, label = owner), hjust = 0) +
  geom_rect(data = events, aes(ymin = 120000, ymax = 125000, xmin = start, xmax = end, fill = event), color = "white") +
  geom_text(data = events, aes(y = c(118000, 118000, 118000, 115000, 115000, 115000, 112000, 118000, 109000, 106000, 112000, 112000), x = start, label = event), hjust = 0) +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  theme_bluewhite() +
  theme(legend.position = "none")


# Render the combined plot.
coverage.volume.trajectory

# Cleaner version (overwrites the one above): drops the editor/owner/event
# bands and labels the three series directly via annotate().
coverage.volume.trajectory <- ggplot() +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "lightblue", size = 1) +
  geom_point(data = newsvolume, aes(y = articles, x = year), color = "black") +
  geom_smooth(data = newsvolume, aes(y = articles, x = year), color = "dodgerblue") +
  geom_point(data = circulation, aes(y = circulation / 7, x = year), shape = 15, size = 4, color = "#888888") +
  geom_point(data = prices, aes(y = realprice * 50000, x = year), shape = 16, size = 4, color = "hotpink") +
  geom_text(data = prices, aes(y = realprice * 50000, x = year, label = round(realprice, 2)), size = 4, color = "hotpink", nudge_y = 4000) +
  geom_text(data = circulation, aes(y = circulation / 7, x = year + 2, label = circulation), color = "#888888", hjust = 0) +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  theme_bluewhite() +
  theme(legend.position = "none") +
  annotate("text", size = 5, x = c(1850, 1930, 1825), y = c(25000, 0, 120000), color = c("black", "darkgrey", "hotpink"), label = c("total stories published", "total circulation", "price (inflation adjusted)"), fontface = "bold")
coverage.volume.trajectory


# Fix: `units` (plural); `unit = "cm"` was absorbed by `...`, leaving inches.
# NOTE(review): the same filename is written again later in the script by the
# ggarrange'd version, which overwrites this file -- confirm which is wanted.
ggsave(coverage.volume.trajectory, filename = "cov_vol_traj.svg", units = "cm", width = 16, height = 12, scale = 1.5)


# Context band plot (editors, owners, events) to sit under the trajectory
# plot; combined with ggarrange() (ggpubr) below.
# Fix: the original chain ended with `annotate("text", fontface = "bold",
# x = c(3000))`, which supplies no `y` or `label`, so printing the plot
# failed with a missing-aesthetics error; the incomplete call is removed.
backgrounds <- ggplot() +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "lightblue", size = 1) +
  geom_rect(data = editors, aes(ymin = 1000, ymax = 3000, xmin = start, xmax = end, fill = editor), color = "white") +
  geom_text(data = editors, aes(y = c(rep(c(-1000, -7000, -4000, -10000), times = 5), -7000, -10000, -4000), x = start, label = editor), hjust = 0) +
  geom_rect(data = owners, aes(ymin = 40000, ymax = 42000, xmin = start, xmax = end, fill = owner), color = "white") +
  geom_text(data = owners, aes(y = c(rep(c(38000, 32000, 35000, 29000), times = 2), 38000, 32000), x = start, label = owner), hjust = 0) +
  geom_rect(data = events, aes(ymin = 20000, ymax = 25000, xmin = start, xmax = end, fill = event), color = "white") +
  geom_text(data = events, aes(y = c(18000, 18000, 18000, 15000, 15000, 15000, 12000, 18000, 9000, 6000, 12000, 12000), x = start, label = event), hjust = 0) +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  theme_bluewhite() +
  theme(legend.position = "none", axis.text.y = element_text(color = "lightskyblue1")) +
  ylab("Events")
backgrounds

# Stack trajectory (2/3 height) over the context bands (1/3 height).
cov.vol.traj <- ggarrange(coverage.volume.trajectory, backgrounds, heights = c(2, 1))

# Fix: `units` (plural); `unit = "cm"` was absorbed by `...`, leaving inches.
ggsave(cov.vol.traj, filename = "cov_vol_traj.svg", units = "cm", width = 16, height = 8, scale = 3)

# Number of distinct values in `x`; used as an aggregation FUN below.
uniq_count <- function(x) {
  length(unique(x))
}

# Count of crisis-labelled (CL) articles per year.
cl <- data.frame(table(wide.STM$year))

# Yearly CL article counts with a tighter loess span; vertical lines at the
# years in `fifty`. table() stores years as a factor, hence the
# as.numeric(as.character(...)) round-trip.
cl_time <- ggplot(cl, aes(x = as.numeric(as.character(Var1)), y = Freq)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.5) +
  geom_point() +
  geom_smooth(span = 0.5, color = "steelblue") +
  theme_bluewhite() +
  xlab("Year") +
  ylab("Count of news articles with CL") +
  scale_x_continuous(breaks = seq(1780, 2020, 10))

# Fix: ggsave() takes `units` (plural); the original `unit = "cm"` fell into
# `...` and the 16x8 dimensions were interpreted in the default inches.
ggsave(cl_time, filename = "cl_time.svg", device = "svg", units = "cm", width = 16, height = 8, dpi = 1200, scale = 1.25)


# Counts of crisis events per year and per decade: total number of waves
# (FUN = length) and number of distinct topics/areas (FUN = uniq_count).
eventcount <- aggregate(wavedata.vol50$TOPIC, by = list(wavedata.vol50$year), FUN = length)
unique_eventcount <- aggregate(wavedata.vol50$TOPIC, by = list(wavedata.vol50$year), FUN = uniq_count)
names(unique_eventcount) <- c("year", "UniqueTopicsCovered")

# Per-decade topic counts.
eventcount_decade <- aggregate(wavedata.vol50$TOPIC, by = list(wavedata.vol50$decade), FUN = length)
unique_eventcount_decade <- aggregate(wavedata.vol50$TOPIC, by = list(wavedata.vol50$decade), FUN = uniq_count)
names(unique_eventcount_decade) <- c("decade", "UniqueTopicsCovered")

# Per-decade counts for the primary area coding (AREA).
eventcount_decade_area <- aggregate(wavedata.vol50$AREA, by = list(wavedata.vol50$decade), FUN = length)
unique_eventcount_decade_area <- aggregate(wavedata.vol50$AREA, by = list(wavedata.vol50$decade), FUN = uniq_count)
names(unique_eventcount_decade_area) <- c("decade", "UniqueAreasCovered")

# Per-decade counts for the alternative area coding (AREA2).
eventcount_decade_area2 <- aggregate(wavedata.vol50$AREA2, by = list(wavedata.vol50$decade), FUN = length)
unique_eventcount_decade_area2 <- aggregate(wavedata.vol50$AREA2, by = list(wavedata.vol50$decade), FUN = uniq_count)
names(unique_eventcount_decade_area2) <- c("decade", "UniqueAreasCovered")




# Merge redundant area labels: fold "Epidemics" into "Epidemic" and
# "Location" into "Geopolitical", then drop the duplicate rows.
# NOTE(review): the replacement values index decade.STM_area2 (the
# unfiltered frame); this assumes both frames list the same decades in the
# same order for each area -- confirm.
decade.STM_area2b <- subset(decade.STM_area2, !area == "Epidemics" & !area == "Location")
decade.STM_area2b[decade.STM_area2b$area == "Epidemic", "articles"] <- decade.STM_area2[decade.STM_area2$area == "Epidemic", "articles"] + decade.STM_area2[decade.STM_area2$area == "Epidemics", "articles"]

decade.STM_area2b[decade.STM_area2b$area == "Geopolitical", "articles"] <- decade.STM_area2[decade.STM_area2$area == "Geopolitical", "articles"] + decade.STM_area2[decade.STM_area2$area == "Location", "articles"]


# NOTE(review): tapply() returns one total per decade; assigning it to a
# column with one row per decade-area pair relies on recycling and on the
# rows being ordered with decade varying fastest -- verify alignment.
decade.STM_area2b$total <- tapply(decade.STM_area2b$articles, decade.STM_area2b$decade, sum, na.rm = TRUE)
decade.STM_area2b$share <- decade.STM_area2b$articles / decade.STM_area2b$total

# Yearly CL article mass per area2 from the long topic-model frame.
year.STM <- aggregate(long.STM$pr, by = list(long.STM$year, long.STM$area2), FUN = "sum", na.rm = TRUE)
names(year.STM) <- c("year", "area2", "count")
# Total per year = sum of counts across areas.
# Fix: the original first assigned newsvolume$articles here and immediately
# overwrote it with the tapply() below; the dead store is removed (behavior
# unchanged).
# NOTE(review): tapply() returns one total per year; recycling it across the
# area2 blocks is only correct if every area2 covers every year in sorted
# order -- confirm.
year.STM$total <- tapply(year.STM$count, year.STM$year, "sum", na.rm = TRUE)
year.STM$share <- year.STM$count / year.STM$total

# Merge redundant area2 labels, mirroring the decade-level cleanup:
# "Epidemics" -> "Epidemic", "Location" -> "Geopolitical".
year.STM2 <- subset(year.STM, !area2 == "Epidemics" & !area2 == "Location")

year.STM2[year.STM2$area2 == "Epidemic", "share"] <- year.STM[year.STM$area2 == "Epidemic", "share"] + year.STM[year.STM$area2 == "Epidemics", "share"]
year.STM2[year.STM2$area2 == "Epidemic", "count"] <- year.STM[year.STM$area2 == "Epidemic", "count"] + year.STM[year.STM$area2 == "Epidemics", "count"]

year.STM2[year.STM2$area2 == "Geopolitical", "share"] <- year.STM[year.STM$area2 == "Geopolitical", "share"] + year.STM[year.STM$area2 == "Location", "share"]
year.STM2[year.STM2$area2 == "Geopolitical", "count"] <- year.STM[year.STM$area2 == "Geopolitical", "count"] + year.STM[year.STM$area2 == "Location", "count"]

# Per-area trajectories of the CL share (small multiples, free y-scales);
# vertical lines at the years in `fifty`.
share_CL_traj <- ggplot(year.STM2, aes(y = 100 * share, x = as.numeric(year), color = area2, fill = area2)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_hline(yintercept = 0, color = "red") +
  geom_smooth() +
  geom_point() +
  facet_wrap(~area2, scales = "free_y", ncol = 3) +
  theme_bluewhite() +
  scale_color_viridis_d() +
  scale_fill_viridis_d() +
  theme(legend.position = "none") +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Share of crisis labelling") +
  xlab("Year")

share_CL_traj_share <- ggplot(decade.STM_area2b, aes(y = 100 * share, x = as.numeric(decade), fill = area)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_area(color = "white") +
  theme_bluewhite() +
  scale_color_viridis_d() +
  scale_fill_viridis_d() +
  theme(legend.position = "bottom") +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Share of crisis labelling") +
  xlab("Year")

# Stacked-area chart of the absolute number of crisis-labelled articles per
# decade, by topic area.
share_CL_traj_abs <- ggplot(decade.STM_area2b, aes(y = articles, x = as.numeric(decade), fill = area)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_area() +
  theme_bluewhite() +
  scale_color_viridis_d() +
  scale_fill_viridis_d() +
  theme(legend.position = "bottom") +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Number of articles with crisis labelling") +
  xlab("Year")

# Final version of the share chart: legend replaced by direct text labels
# placed manually inside the stacked areas (x = year, y = % position per
# label; colors alternate white/black for contrast against the fill).
share_CL_traj_share <- ggplot(decade.STM_area2b, aes(y = 100 * share, x = as.numeric(decade), fill = area)) +
  geom_area(color = "white", show.legend = FALSE) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  theme_bluewhite() +
  scale_color_viridis_d() +
  scale_fill_viridis_d() +
  theme(legend.position = "bottom") +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Share of crisis labelling") +
  xlab("Year") +
  # The three vectors below are parallel: i-th x, i-th y, i-th label.
  annotate("text",
    x =
      c(1900, 1980, 2008, 1973, 1850, 2000, 1800, 1900, 1830, 2010, 1820, 2000, 2000, 1805, 2020, 2000, 1870, 2015, 1860, 2015), y =
      c(95, 85, 65, 63, 75, 57, 75, 50, 30, 22, 17, 17, 12, 8, 8.5, 5.5, 6, 3, 2.5, 1), label =
      c("Disaster", "Economic", "Education", "Energy", "Epidemic", "Family", "Functional", "Geopolitical", "Government", "Health", "Justice", "Labor", "Leisure", "Military", "Public", "Science", "Society", "Technology", "Transport", "Welfare"), color = c(rep(c("white", "black"), each = 10))
  )


# Export the three trajectory figures as SVG.
ggsave(share_CL_traj, file = "share_CL_traj.svg", units = "cm", width = 16, height = 20, dpi = 1200, scale = 2)
ggsave(share_CL_traj_share, file = "share_CL_traj_share.svg", units = "cm", width = 16, height = 10, dpi = 1200, scale = 1.75)
ggsave(share_CL_traj_abs, file = "share_CL_traj_abs.svg", units = "cm", width = 16, height = 12, dpi = 1200, scale = 2)


# Breadth of crisis coverage per decade: how many distinct topics / areas the
# decade's crisis events span. The red horizontal line marks the theoretical
# maximum (106 topics, 22 areas, 48 area2 categories); linear (green) and
# loess (pink, dashed) trends are overlaid.
gg_topicspectrum.event.decades <- ggplot(unique_eventcount_decade, aes(x = decade, y = UniqueTopicsCovered)) +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "lightblue", size = 1) +
  geom_point(size = 2.5) +
  geom_smooth(method = "lm", fill = "#44aa66", color = "#44aa66") +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77", linetype = "longdash") +
  geom_hline(yintercept = 106, color = "red") +
  theme_bluewhite() +
  ylim(0, 106) +
  ylab("Topics covered by crisis events in the decade") +
  xlab("Decade") +
  scale_x_continuous(breaks = seq(1780, 2020, 10))

# Same plot for the 22 coarse topic areas.
gg_areaspectrum.event.decades <- ggplot(unique_eventcount_decade_area, aes(x = decade, y = UniqueAreasCovered)) +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "lightblue", size = 1) +
  geom_point(size = 2.5) +
  geom_smooth(method = "lm", fill = "#44aa66", color = "#44aa66") +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77", linetype = "longdash") +
  geom_hline(yintercept = 22, color = "red") +
  theme_bluewhite() +
  ylim(0, 22) +
  ylab("Areas covered by crisis events in the decade") +
  xlab("Decade") +
  scale_x_continuous(breaks = seq(1780, 2020, 10))

# Same plot for the 48 intermediate (area2) categories.
gg_areaspectrum2.event.decades <- ggplot(unique_eventcount_decade_area2, aes(x = decade, y = UniqueAreasCovered)) +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "lightblue", size = 1) +
  geom_point(size = 2.5) +
  geom_smooth(method = "lm", fill = "#44aa66", color = "#44aa66") +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77", linetype = "longdash") +
  geom_hline(yintercept = 48, color = "red") +
  theme_bluewhite() +
  ylim(0, 48) +
  ylab("Areas covered by crisis events in the decade") +
  xlab("Decade") +
  scale_x_continuous(breaks = seq(1780, 2020, 10))


# Export the three spectrum figures.
ggsave(gg_topicspectrum.event.decades, file = "topicspectrum_event_decade.svg", units = "cm", width = 16, height = 8, dpi = 1200, scale = 1.5)
ggsave(gg_areaspectrum.event.decades, file = "areaspectrum_event_decade.svg", units = "cm", width = 16, height = 8, dpi = 1200, scale = 1.5)
ggsave(gg_areaspectrum2.event.decades, file = "area2spectrum_event_decade.svg", units = "cm", width = 16, height = 8, dpi = 1200, scale = 1.5)


# Total crisis-event article volume per year.
crisisarticles <- aggregate(wavedata.vol50$volume, by = list(wavedata.vol50$year), FUN = sum)


# Tidy up aggregate()'s default column names.
crisisarticles$year <- as.numeric(crisisarticles$Group.1)
crisisarticles$volume <- crisisarticles$x
# NOTE(review): CNW, CL and Coverage start as identical copies of volume;
# only CNW is actually used below — CL and Coverage are replaced from
# newsvolume when building crisis.esc.stages.
crisisarticles$CNW <- crisisarticles$volume
crisisarticles$CL <- crisisarticles$volume
crisisarticles$Coverage <- crisisarticles$volume

# Escalation-stage comparison per year, 1788-2020:
#   CNW      = coverage in crisis news waves (from crisisarticles)
#   CL       = coverage with crisis labelling (from newsvolume)
#   Coverage = total newspaper coverage (from newsvolume)
crisis.esc.stages <- data.frame(year = seq(1788, 2020, 1))
crisis.esc.stages$CNW <- crisisarticles$CNW[match(crisis.esc.stages$year, crisisarticles$year)]
crisis.esc.stages$CL <- newsvolume$crisis[match(crisis.esc.stages$year, newsvolume$year)]
crisis.esc.stages$Coverage <- newsvolume$articles[match(crisis.esc.stages$year, newsvolume$year)]

# Ratios between the stages.
crisis.esc.stages$CNWtoCoverage <- crisis.esc.stages$CNW / crisis.esc.stages$Coverage
crisis.esc.stages$CLtoCoverage <- crisis.esc.stages$CL / crisis.esc.stages$Coverage
crisis.esc.stages$CNWtoCL <- crisis.esc.stages$CNW / crisis.esc.stages$CL

# Each series rescaled to its own maximum (0-1) for visual comparison.
crisis.esc.stages$Coverage100 <- crisis.esc.stages$Coverage / max(crisis.esc.stages$Coverage)
crisis.esc.stages$CL100 <- crisis.esc.stages$CL / max(crisis.esc.stages$CL)
crisis.esc.stages$CNW100 <- crisis.esc.stages$CNW / max(crisis.esc.stages$CNW, na.rm = TRUE)

# Quick diagnostic plot of the three rescaled series.
ggplot(crisis.esc.stages, aes(x = year)) +
  geom_point(aes(y = Coverage100), color = "blue") +
  geom_point(aes(y = CL100), color = "green") +
  geom_point(aes(y = CNW100), color = "red")

# Share of total coverage with crisis labelling over time (full-size figure).
CL.time <- ggplot(crisis.esc.stages, aes(x = year, y = 100 * CLtoCoverage)) +
  geom_vline(xintercept = fifty, color = "lightsteelblue", size = 1) +
  geom_point() +
  geom_smooth(span = 0.25, color = "steelblue", fill = "steelblue") +
  theme_bluewhite() +
  ylim(0, 5) +
  ylab("Share of coverage with crisis labelling (CL)") +
  xlab("Year") +
  scale_x_continuous(breaks = seq(1780, 2020, 10))

# Compact variant with sparser axis breaks, for small-panel layouts.
CL.time.small <- ggplot(crisis.esc.stages, aes(x = year, y = 100 * CLtoCoverage)) +
  geom_vline(xintercept = fifty, color = "lightsteelblue", size = 1) +
  geom_point() +
  geom_smooth(span = 0.25, color = "steelblue", fill = "steelblue") +
  theme_bluewhite() +
  ylim(0, 5) +
  ylab("Share of news stories \n with crisis labelling \n(% of total news coverage)") +
  xlab("Year") +
  scale_x_continuous(breaks = seq(1800, 2000, 50))

ggsave(CL.time.small, file = "CL_time_small.svg", device = "svg", dpi = 1200, unit = "cm", width = 4.4, height = 2.2, scale = 3.00)

ggsave(CL.time, file = "CL_time.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)



# Share of total coverage that is part of crisis news waves, over time.
CNW.time <- ggplot(crisis.esc.stages, aes(x = year, y = 100 * CNWtoCoverage)) +
  geom_vline(xintercept = fifty, color = "lightsteelblue", size = 1) +
  geom_point() +
  geom_smooth(span = 0.25, color = "steelblue", fill = "steelblue") +
  theme_bluewhite() +
  ylab("Share of coverage that is part of crisis news waves (CNWs)") +
  xlab("Year") +
  scale_x_continuous(breaks = seq(1780, 2020, 10))

ggsave(CNW.time, file = "CNW_time.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)


# Yearly number of articles assigned to crisis events, log10 y-axis, with
# linear (green, dashed) and loess (pink) trends. Point size scales with
# volume; years are printed as labels on top of gray triangle markers.
# (The original defined this plot twice in a row; the first, linear-scale
# version was dead code — immediately overwritten — and has been removed.)
crisisarticles.time <- ggplot(crisisarticles, aes(y = volume, x = year, label = year, size = (volume))) +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "lightblue3", size = 1) +
  geom_point(color = "gray", shape = 17) +
  geom_text(color = "black") +
  geom_smooth(linetype = "longdash", fill = "#44aa66", color = "#44aa66", method = "lm", na.rm = TRUE) +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77", na.rm = TRUE) +
  xlab("Year") +
  ylab("Number of articles assigned to crisis events per year") +
  theme_bluewhite() +
  theme(legend.position = "none") +
  scale_y_continuous(trans = "log10") +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  annotation_logticks()



# Linear trend of crisis-event article volume, with predicted values at the
# half-century marks (2050 is an out-of-sample extrapolation).
crisisarticles.lm <- lm(volume ~ year, data = crisisarticles)
predict(crisisarticles.lm, newdata = list(year = c(1800, 1850, 1900, 1950, 2000, 2050)))

ggsave(crisisarticles.time, file = "crisisarticles+time.svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.5)





# Count of crisis events per year: duration/duration yields 1 for each wave,
# so summing gives the number of waves that year.
# NOTE(review): this propagates NA for waves with missing duration and NaN
# when duration == 0 — presumably neither occurs in wavedata.vol50; confirm.
eventcount <- aggregate(wavedata.vol50$duration / wavedata.vol50$duration, by = list(wavedata.vol50$year), FUN = sum)



# Tidy up aggregate()'s default column names.
eventcount$year <- as.numeric(eventcount$Group.1)
eventcount$count <- eventcount$x

# Number of crisis events identified per year, with linear (green, dashed)
# and loess (pink) trends; year labels drawn over gray triangle markers.
eventcount.time <- ggplot(eventcount, aes(y = count, x = year, label = year, size = count)) +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "lightblue3", size = 1) +
  geom_point(shape = 17, color = "gray") +
  geom_text(color = "black") +
  geom_smooth(linetype = "longdash", fill = "#44aa66", color = "#44aa66", method = "lm", na.rm = TRUE) +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77", na.rm = TRUE, span = .25) +
  xlab("Year") +
  ylab("Number of crisis events identified") +
  theme_bluewhite() +
  theme(legend.position = "none") +
  scale_x_continuous(breaks = seq(1780, 2020, 10))

# Compact variant with sparser axis breaks for small-panel layouts.
eventcount.time.small <- ggplot(eventcount, aes(y = count, x = year, label = year, size = count)) +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "lightblue3", size = 1) +
  geom_point(shape = 17, color = "gray") +
  geom_text(color = "black") +
  geom_smooth(linetype = "longdash", fill = "#44aa66", color = "#44aa66", method = "lm", na.rm = TRUE) +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77", na.rm = TRUE, span = .25) +
  xlab("Year") +
  ylab("Number of crisis \n news waves identified") +
  theme_bluewhite() +
  theme(legend.position = "none") +
  scale_x_continuous(breaks = seq(1800, 2000, 50))

ggsave(eventcount.time.small, file = "eventcount_time_small.svg", device = "svg", dpi = 1200, unit = "cm", width = 4.4, height = 2.2, scale = 3.00)

# Rebuild the wave-level dataset from wd250x; attach numeric year and the
# coarse topic-area label (topic250 -> topic20 via the to.lab lookup table).
wavedata.vol50 <- wd250x
wavedata.vol50$YEAR <- as.numeric(wavedata.vol50$year)
wavedata.vol50$AREA_i <- to.lab[match(wavedata.vol50$topic250, to.lab$topic250), "topic20"]

# First year each area appears in the data.
areas.by.year <- tapply(wavedata.vol50$YEAR, wavedata.vol50$AREA_i, min, na.rm = TRUE)

# Ordered factor so plot axes list areas by first appearance.
wavedata.vol50$AREA_f <- factor(wavedata.vol50$AREA_i, levels = names(areas.by.year)[order(areas.by.year)], ordered = TRUE)

# Two 10-colour palettes (plasma reversed, viridis forward), interleaved into
# a 19-colour scale: plasma[1], viridis[1], plasma[2], ..., plasma[10].
inf <- plasma(n = 10, alpha = 1, begin = 0.1, end = 0.9, direction = -1)
infr <- viridis(n = 10, alpha = 1, begin = 0.1, end = 0.9, direction = 1)
infscale <- c(rbind(inf, infr))[1:19]

# Timeline of crisis events by topic area (y ordered by first appearance);
# squares jittered vertically within each area row, sized by wave volume.
eventscope <- ggplot(subset(wavedata.vol50, !is.na(AREA_i)), aes(x = YEAR, y = AREA_f, group = AREA_i, color = AREA_i, fill = AREA_i, size = volume, label = YEAR)) +
  geom_vline(xintercept = fifty, color = "#a99171", size = 1.5) +
  geom_jitter(shape = 15, alpha = .8, width = 0, height = 0.25, show.legend = FALSE) +
  theme_soft() +
  ylab("Topic area") +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  scale_color_manual(values = infscale)

ggsave(eventscope, file = "eventscope.svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.5)
</ doc_update>
<doc_update>
# Timeline of crisis events by topic area (y ordered by first appearance);
# squares jittered vertically within each area row, sized by wave volume.
eventscope <- ggplot(subset(wavedata.vol50, !is.na(AREA_i)), aes(x = YEAR, y = AREA_f, group = AREA_i, color = AREA_i, fill = AREA_i, size = volume, label = YEAR)) +
  geom_vline(xintercept = fifty, color = "#a99171", size = 1.5) +
  geom_jitter(shape = 15, alpha = .8, width = 0, height = 0.25, show.legend = FALSE) +
  theme_soft() +
  ylab("Topic area") +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  scale_color_manual(values = infscale)

ggsave(eventscope, file = "eventscope.svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.5)

# Exploratory per-area event timelines (year labels sized by log volume).
# The original was ten near-identical copy-pasted ggplot calls; a loop over
# the topic sets removes the duplication. `==` on a single code and `%in%`
# are equivalent under subset() here. print() is required so the plots still
# render when this is not run at the interactive top level.
topic_sets <- list(
  "ENV", "NRG", "EPI", "DIS", "TRA", "ECO", "HEA",
  c("SCI", "EDU"), "GEO", "LEI"
)
for (tset in topic_sets) {
  print(
    ggplot(subset(wd250x, topic20 %in% tset), aes(x = as.numeric(year), y = 1, label = year, size = log(volume))) +
      geom_text(position = position_jitter(0.1)) +
      theme_soft() +
      scale_x_continuous(limits = c(1780, 2020), breaks = seq(1800, 2000, 50), minor_breaks = seq(1780, 2020, 10))
  )
}


# Compact event-scope timeline: drops six low-salience areas and uses the
# default palette with sparser axis breaks, for small-panel layouts.
eventscope.small <- ggplot(subset(wavedata.vol50, !is.na(AREA_i) & !(AREA_i %in% c("Functional", "Family", "Society", "Education", "ScienceTech", "Public"))), aes(x = YEAR, y = AREA_f, group = AREA_i, color = AREA_i, fill = AREA_i, size = volume, label = YEAR)) +
  geom_vline(xintercept = fifty, color = "lightskyblue2", size = 1.5) +
  geom_jitter(width = 0, height = 0.25, show.legend = FALSE) +
  theme_bluewhite() +
  ylab("Topic area") +
  scale_x_continuous(breaks = seq(1800, 2000, 50))

ggsave(eventscope.small, file = "eventscope_small.svg", dpi = 1200, unit = "cm", width = 4.2, height = 2.1, scale = 3)



# Linear trend of yearly event counts, with predictions at the half-century
# marks (2050 extrapolated).
eventcount.lm <- lm(count ~ year, data = eventcount)
predict(eventcount.lm, newdata = list(year = c(1800, 1850, 1900, 1950, 2000, 2050)))

# Same regression on aggregate()'s raw column names (x = count,
# Group.1 = year). BUG FIX: the original passed `data = x`, but the columns
# `x` and `Group.1` belong to `eventcount` (`x` holds an unrelated table by
# the time this script section runs).
summary(lm(x ~ as.numeric(Group.1), data = eventcount))

ggsave(eventcount.time, file = "eventcount+time.svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.5)

# Map each topic to a rank (1, 2, ...) by its first year of appearance, so
# topics can be plotted in order of emergence.
firstyear <- tapply(wavedata.vol50$YEAR, wavedata.vol50$topic, min, na.rm = TRUE)
topictrafo <- firstyear[order(firstyear)]
# seq_along() replaces the hard-coded 1:120, which errored (or silently
# recycled) whenever the number of distinct topics was not exactly 120.
topic.trafo <- data.frame(original = names(topictrafo), modified = seq_along(topictrafo))


# Attach the rank to each wave as a factor for discrete colour scales.
wavedata.vol50$topic.trafo <- factor(topic.trafo[match(wavedata.vol50$topic, topic.trafo$original), "modified"])

# Axis break positions as POSIXct timestamps: one per decade, 1780-2020.
datebreaks <- as.POSIXct(paste0(seq(1780, 2020, by = 10), "-01-01"))

# Half-century reference-line positions, 1800-2000.
fifty2 <- as.POSIXct(paste0(seq(1800, 2000, by = 50), "-01-01"))

# Density-of-crises plot: each wave drawn as a rectangle whose x-extent is
# its start/duration (day counts converted to POSIXct via seconds since
# origin; duration is stretched x10 for visibility) and whose height encodes
# intensity, stacked by topic rank. A background rectangle frames the full
# 1788-2022 observation window.
# NOTE(review): scale_x_continuous on a POSIXct aesthetic relies on POSIXct's
# numeric representation lining up with `datebreaks`; labels are supplied
# manually as decade numbers.
crisisplot <- ggplot(wavedata.vol50, aes(color = topic.trafo, fill = topic.trafo, xmin = as.POSIXct(start * 60 * 60 * 24, origin = "0000-01-01"), xmax = as.POSIXct((start + duration * 10) * 60 * 60 * 24, origin = "0000-01-01"), ymin = as.numeric(topic.trafo), ymax = as.numeric(topic.trafo) + 2 * intensity)) +
  geom_vline(xintercept = fifty2, size = 1.15, color = "lightblue3") +
  geom_rect(fill = "aliceblue", color = "skyblue3", size = 1.5, aes(ymin = 0, ymax = 110, xmin = as.POSIXct("1788-01-01"), xmax = as.POSIXct("2022-12-31"))) +
  geom_rect() +
  scale_fill_viridis_d(na.value = "grey80") +
  scale_color_viridis_d(na.value = "grey80") +
  theme_bluewhite() +
  ylim(0, 110) +
  ylab("Topic ID") +
  xlab("Year") +
  theme(legend.position = "none") +
  scale_x_continuous(breaks = datebreaks, labels = seq(1780, 2020, 10)) +
  scale_y_continuous(breaks = seq(10, 100, 10), minor_breaks = seq(0, 110, 5))

ggsave(crisisplot, file = "crisisdensity+time.svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

# Prediction points for the trend regressions (2050 extrapolated).
years50 <- c(1800, 1850, 1900, 1950, 2000, 2050)

wavedata.vol50$YEAR <- as.numeric(wavedata.vol50$year)

# Total Times coverage per year, from the `thetimes` volume table.
wavedata.vol50$TotalNewspaperVolume <- thetimes[match(wavedata.vol50$year, thetimes$year), "articles"]

# Average daily intensity of a wave: articles per day of its duration.
# Moved ABOVE the regressions: the original only defined intensity_i after
# summary(lm(intensity_i ~ ...)) had already referenced it, which fails
# unless the column was already present on wd250x (NOTE(review): confirm).
wavedata.vol50$intensity_i <- wavedata.vol50$volume / wavedata.vol50$duration

# Linear time trends in wave characteristics, with half-century predictions.
volume.lm <- (lm(volume ~ (YEAR), data = wavedata.vol50))
predict(volume.lm, newdata = list(YEAR = years50))

duration.lm <- (lm(duration ~ (YEAR), data = wavedata.vol50))
predict(duration.lm, newdata = list(YEAR = years50))

intensity.lm <- (lm(intensity ~ (YEAR), data = wavedata.vol50))
predict(intensity.lm, newdata = list(YEAR = years50))

max.intensity.lm <- (lm(max.intensity ~ (YEAR), data = wavedata.vol50))
predict(max.intensity.lm, newdata = list(YEAR = years50))

variability.lm <- (lm(variability ~ (YEAR), data = wavedata.vol50))
predict(variability.lm, newdata = list(YEAR = years50))

# Same trends expressed per year elapsed since 1788 (start of the corpus).
summary(lm(duration ~ I(as.numeric(year) - 1788), data = wavedata.vol50))
summary(lm(intensity_i ~ I(as.numeric(year) - 1788), data = wavedata.vol50))
summary(lm(max.intensity ~ I(as.numeric(year) - 1788), data = wavedata.vol50))
summary(lm(variability ~ I(as.numeric(year) - 1788), data = wavedata.vol50))
summary(lm((variability / intensity) ~ I(as.numeric(year) - 1788), data = wavedata.vol50))
summary(lm(baseline365 ~ I(as.numeric(year) - 1788), data = wavedata.vol50))

# Average daily coverage intensity per wave over time, log10 y-axis, with
# linear (green, dashed) and loess (pink) trends.
intensity.trajectory <- ggplot(wavedata.vol50, aes(x = as.numeric(year), y = intensity_i)) +
  geom_jitter(size = 0.5) +
  geom_smooth(linetype = "longdash", fill = "#44aa66", color = "#44aa66", method = "lm", na.rm = TRUE) +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77") +
  theme_bluewhite() +
  xlab("Year") +
  ylab("Average daily intensity of coverage\nduring the identified crisis event") +
  scale_y_log10(breaks = c(0.05, 0.1, 1, 5), limits = c(0.05, 5)) +
  annotation_logticks() +
  scale_x_continuous(breaks = seq(1780, 2020, 10), minor_breaks = (seq(1780, 2020, 2)))


ggsave(intensity.trajectory, file = "intensity+time.svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

# Duration of above-baseline coverage per wave over time (linear y-axis,
# capped at 160 days).
duration.trajectory <- ggplot(wavedata.vol50, aes(x = as.numeric(year), y = duration)) +
  geom_jitter(size = 0.5) +
  geom_smooth(linetype = "longdash", fill = "#44aa66", color = "#44aa66", method = "lm", na.rm = TRUE) +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77") +
  theme_bluewhite() +
  xlab("Year") +
  ylab("Average duration of above-baseline coverage\nduring the identified crisis event") +
  ylim(0, 160) +
  scale_x_continuous(breaks = seq(1780, 2020, 10), minor_breaks = (seq(1780, 2020, 2)))

ggsave(duration.trajectory, file = "duration+time.svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

# Total coverage volume per wave over time, log10 y-axis (5-500), with
# linear (green, dashed) and loess (pink) trends.
# The original also called ylim(0, 500), which ggplot2 immediately replaced
# with the scale_y_log10() limits (emitting a "Scale for y is already
# present" warning); it was dead code — and 0 is invalid on a log axis —
# so it has been removed.
volume.trajectory <- ggplot(wavedata.vol50, aes(x = as.numeric(year), y = volume)) +
  geom_jitter(size = 0.5) +
  geom_smooth(linetype = "longdash", fill = "#44aa66", color = "#44aa66", method = "lm", na.rm = TRUE) +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77") +
  theme_bluewhite() +
  xlab("Year") +
  ylab("Total volume of coverage\nduring the identified crisis case") +
  scale_x_continuous(breaks = seq(1780, 2020, 10), minor_breaks = seq(1780, 2020, 2)) +
  scale_y_log10(breaks = c(5, 10, 50, 100, 500), limits = c(5, 500)) +
  annotation_logticks()
volume.trajectory

ggsave(volume.trajectory, file = "volume+time.svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

# Compact variant of the volume trajectory for small-panel layouts.
# As in the full-size version, the original's ylim(0, 500) was dead code —
# immediately replaced by the scale_y_log10() limits (with a ggplot2
# "Scale for y is already present" warning) — and has been removed.
volume.trajectory.small <- ggplot(wavedata.vol50, aes(x = as.numeric(year), y = volume)) +
  geom_jitter(size = 0.5) +
  geom_smooth(linetype = "longdash", fill = "#44aa66", color = "#44aa66", method = "lm", na.rm = TRUE) +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77") +
  theme_bluewhite() +
  xlab("Year") +
  ylab("Total volume of coverage\nduring crisis news waves") +
  scale_x_continuous(breaks = seq(1800, 2000, 50), minor_breaks = seq(1775, 2025, 25)) +
  scale_y_log10(breaks = c(5, 10, 50, 100, 500), limits = c(5, 500)) +
  annotation_logticks()

ggsave(volume.trajectory.small, file = "volume+time_small.svg", dpi = 1200, unit = "cm", width = 4.2, height = 2.1, scale = 3.00)


# Peak (maximum daily) coverage per wave over time, log10 y-axis.
peak.trajectory <- ggplot(wavedata.vol50, aes(x = as.numeric(year), y = max.intensity)) +
  geom_jitter(size = 0.5) +
  geom_smooth(linetype = "longdash", fill = "#44aa66", color = "#44aa66", method = "lm", na.rm = TRUE) +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77") +
  theme_bluewhite() +
  xlab("Year") +
  ylab("Maximum amount of coverage\nduring the identified crisis case") +
  scale_y_log10(breaks = c(1, 10, 100), limits = c(0.1, 100)) +
  scale_x_continuous(breaks = seq(1780, 2020, 10), minor_breaks = seq(1780, 2020, 2)) +
  annotation_logticks()

ggsave(peak.trajectory, file = "peak+time.svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)





# Exploratory diagnostics (not saved): coefficient of variation
# (variability / intensity) and raw variability of wave coverage over time.
relative.variability.trajectory <- ggplot(wavedata.vol50, aes(x = as.numeric(year), y = variability / intensity)) +
  geom_point() +
  geom_smooth() +
  theme_bluewhite()
variability.trajectory <- ggplot(wavedata.vol50, aes(x = as.numeric(year), y = variability)) +
  geom_point() +
  geom_smooth() +
  theme_bluewhite()


### Total volume of coverage

# Articles with crisis labelling per year (row frequency of wide.STM by year),
# merged into the newsvolume table to compute the yearly CL share.
x <- data.frame(table(wide.STM$year))
newsvolume$crisis <- x[match(newsvolume$year, x$Var1), "Freq"]
newsvolume$crisis.share <- newsvolume$crisis / newsvolume$articles

# CL share over time with loess (pink) and linear (green) trends.
crisis.labelling <- ggplot(newsvolume, aes(x = year, y = 100 * crisis.share)) +
  geom_vline(xintercept = fifty, size = 1.15, color = "lightblue3") +
  geom_point() +
  geom_smooth(span = 0.25, na.rm = TRUE, fill = "#dd1c77", color = "#dd1c77") +
  # stray trailing comma (an empty argument in the call) removed
  geom_smooth(na.rm = TRUE, method = "lm", fill = "#44aa66", color = "#44aa66") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10), minor_breaks = seq(1780, 2020, 2)) +
  xlab("Year") +
  ylab("Share of total coverage with crisis labelling")

# Topic shares within crisis-labelled coverage; Topics ordered by descending
# share for the bar chart.
topic.STM$Total <- sum(topic.STM$x)
topic.STM$Share <- topic.STM$x / topic.STM$Total
topic.STM$Topics <- factor(topic.STM$Group.1, levels = topic.STM$Group.1[order(topic.STM$Share, decreasing = TRUE)], ordered = TRUE)

# Bar chart of topic shares, with percentage labels printed above each bar.
economic.topics <- ggplot(topic.STM, aes(y = 100 * Share, x = Topics, color = Topics, fill = Topics)) +
  geom_col() +
  geom_text(aes(label = 100 * round(Share, 3), y = 100 * Share + 1), color = "black", size = 3) +
  theme_bluewhite() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylab("Share of topics in coverage with crisis labelling") +
  scale_color_viridis_d() +
  scale_fill_viridis_d() +
  theme(legend.position = "none")

topicareaplot2 <- function(d, area2) {
  # Stacked-area chart of each area's percentage share of articles per decade.
  #
  # d:     data frame with columns `article`, `decade`, `area`.
  # area2: label(s) used as the plot title.
  # Returns a ggplot object.
  #
  # Name-based lookup aligns every row with its decade total. The original
  # indexed the tapply result via match(d$decade, d$decade[1:25]), which
  # silently misaligns whenever the first 25 rows do not enumerate all
  # decades in tapply's (sorted) order. (The redundant `d <- d` is dropped.)
  decade_totals <- tapply(d$article, d$decade, sum)
  d$total <- decade_totals[as.character(d$decade)]
  d$percentage <- round(100 * d$article / d$total, 1)
  gg <- ggplot(d, aes(y = percentage, x = decade, fill = area)) +
    geom_area(color = "white") +
    geom_hline(yintercept = 0, color = "#dd4422") +
    theme_light() +
    ggtitle(area2) +
    scale_x_continuous(breaks = seq(1780, 2020, 10)) +
    scale_fill_viridis_d()
  return(gg)
}

topicareaplot3 <- function(d, area2) {
  # Like topicareaplot2, but fills by the pre-ordered `area_ordered` factor
  # and sets no title. `area2` is unused but kept for interface
  # compatibility with existing callers.
  #
  # d:     data frame with columns `article`, `decade`, `area_ordered`.
  # Returns a ggplot object.
  #
  # Name-based lookup replaces the original's fragile
  # match(d$decade, d$decade[1:25]) indexing (see topicareaplot2); the
  # redundant `d <- d` is dropped.
  decade_totals <- tapply(d$article, d$decade, sum)
  d$total <- decade_totals[as.character(d$decade)]
  d$percentage <- round(100 * d$article / d$total, 1)
  gg <- ggplot(d, aes(y = percentage, x = decade, fill = area_ordered)) +
    geom_area(color = "white") +
    geom_hline(yintercept = 0, color = "#dd4422") +
    theme_light() +
    scale_x_continuous(breaks = seq(1780, 2020, 10)) +
    scale_fill_viridis_d()
  return(gg)
}

# Order areas by total article count (ascending) so stacking is stable.
decade.STM_area2$area_ordered <- factor(decade.STM_area2$area, ordered = TRUE, levels = names(table(decade.STM_area2$area))[order(tapply(decade.STM_area2$articles, decade.STM_area2$area, sum, na.rm = TRUE), decreasing = FALSE)])

# Decade-level area shares with a hand-picked 22-colour palette (alternating
# blue/green ramps; "red" as the final catch-all).
economic.topics.time.share <- topicareaplot3(d = decade.STM_area2, area2 = names(table(decade.STM_area2$area))) + geom_vline(xintercept = fifty, size = 1.15, color = "lightblue3") + theme_bluewhite() + scale_fill_manual(values = c("#f7fbff", "#00441b", "#deebf7", "#006d2c", "#c6dbef", "#238b45", "#9ecae1", "#66a182", "#00798c", "#6baed6", "#41ab5d", "#4292c6", "#74c476", "#2171b5", "#a1d99b", "#08519c", "#c7e9c0", "#08306b", "#e5f5e0", "#111111", "#8d96a3", "red")) + theme(legend.position = "right") + ylab("Share of articles with crisis labelling") + guides(fill = guide_legend("Topic area"))

# Absolute article counts per decade, grayscale ramp.
economic.topics.time.abs <- ggplot(data = decade.STM_area2, aes(y = articles, x = decade, fill = area_ordered)) +
  geom_vline(xintercept = fifty, size = 1.15, color = "lightblue3") +
  geom_area() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  theme_bluewhite() +
  scale_fill_manual(values = c("#555555", "#565656", "#666666", "#676767", "#777777", "#787878", "#888888", "#898989", "#999999", "#9a9a9a", "#aaaaaa", "#ababab", "#bbbbbb", "#bcbcbc", "#cccccc", "#cdcdcd", "#dddddd", "#dedede", "#eeeeee", "#fefefe", "#ffffff", "red")) +
  theme(legend.position = c(0.2, .525)) +
  ylab("Count of articles with crisis labelling") +
  guides(fill = guide_legend("Topic area"))

# Economic sub-areas: positions 2-12 of the alphabetical area list.
# NOTE(review): hard-coded index range — breaks silently if the set of areas
# in decade.STM_area changes; confirm which area position 1 drops.
ecotoplist <- names(table(decade.STM_area$area))[2:12]

# Share of economic crisis labelling by sub-topic, per decade.
withineconomic.topics.time.share <- topicareaplot2(d = subset(decade.STM_area, area %in% ecotoplist), area2 = ecotoplist) + theme_bluewhite() + theme(legend.position = "right") + geom_vline(xintercept = fifty, size = 1.15, color = "lightblue3") + ylab("Share of economic crisis labelling")

# Absolute counts for the same sub-topics.
withineconomic.topics.time.abs <- ggplot(data = subset(decade.STM_area, area %in% ecotoplist), aes(y = articles, x = decade, fill = area)) +
  geom_vline(xintercept = fifty, size = 1.15, color = "lightblue3") +
  geom_area() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  theme_bluewhite() +
  scale_fill_viridis_d() +
  ylab("Number of articles with crisis labelling") +
  theme(legend.position = (c(0.15, 0.5))) +
  guides(fill = guide_legend("Topic within Economy"))

# Binary economic / non-economic classification of crisis events.
# NOTE(review): AREA2 is not created in this section of the script — it must
# come from wd250x or an earlier section; confirm.
wavedata.vol50$ECO <- ifelse(wavedata.vol50$AREA2 %in% ecotoplist, "Economic Crisis Event", "Non-Economic Crisis Event")

# Event counts per decade and type; Index rescales each type to its observed
# maximum (hard-coded: 27 economic, 98 non-economic — NOTE(review): these
# must be updated if the data change).
eco.CE.decade <- data.frame(table(wavedata.vol50$decade, wavedata.vol50$ECO))
names(eco.CE.decade) <- c("Decade", "EventType", "Count")
eco.CE.decade$rowmax <- as.numeric(as.character(Recode(eco.CE.decade$EventType, "'Economic Crisis Event'=27;'Non-Economic Crisis Event'=98;else=NA")))
eco.CE.decade$Index <- 100 * eco.CE.decade$Count / eco.CE.decade$rowmax
eco.CE.decade$AllEvents <- rep(subset(eco.CE.decade, EventType == "Economic Crisis Event")$Count + subset(eco.CE.decade, EventType == "Non-Economic Crisis Event")$Count, times = 2)

# Economic crisis events per decade with linear and loess trends.
eco.CE.time <- ggplot(data = subset(eco.CE.decade, EventType == "Economic Crisis Event"), aes(x = as.numeric(as.character(Decade)), y = Count)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_point() +
  geom_smooth(linetype = "longdash", fill = "#44aa66", color = "#44aa66", method = "lm", na.rm = TRUE) +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77") +
  theme_bluewhite() +
  xlab("Decade") +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  ylab("Economic Crisis Event Count")

# Economic vs non-economic events, indexed to each type's maximum (=100).
comp.CE.time.Index <- ggplot(data = eco.CE.decade, aes(x = as.numeric(as.character(Decade)), y = Index, color = EventType, fill = EventType, group = EventType)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_point() +
  geom_smooth() +
  theme_bluewhite() +
  xlab("Decade") +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  theme(legend.position = c(0.825, 0.125)) +
  scale_color_viridis_d(begin = .2, end = .8) +
  scale_fill_viridis_d(begin = .2, end = .8) +
  ylab("Index of Crisis Events (100=maximum)")

# Same comparison on absolute counts.
comp.CE.time.abs <- ggplot(data = eco.CE.decade, aes(x = as.numeric(as.character(Decade)), y = Count, color = EventType, fill = EventType, group = EventType)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_point() +
  geom_smooth() +
  theme_bluewhite() +
  xlab("Decade") +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  theme(legend.position = c(0.8, 0.8)) +
  scale_color_viridis_d(begin = .2, end = .8) +
  scale_fill_viridis_d(begin = .2, end = .8) +
  ylab("Count of Crisis Events")

# Stacked columns: each type's share of all events per decade.
comp.CE.time.col <- ggplot(data = eco.CE.decade, aes(x = as.numeric(as.character(Decade)), y = 100 * Count / AllEvents, color = EventType, fill = EventType, group = EventType)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_col() +
  theme_bluewhite() +
  xlab("Decade") +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  theme(legend.position = c(0.2, 0.2)) +
  scale_color_viridis_d(begin = .2, end = .8) +
  scale_fill_viridis_d(begin = .2, end = .8) +
  ylab("Share of Crisis Events")

# Composition of economic crisis types per decade (proportional bars).
economic.crisis.types.decades <- ggplot(data = subset(wavedata.vol50, AREA == "Economy"), aes(x = decade, color = AREA2, fill = AREA2)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_bar(position = "fill") +
  theme_bluewhite() +
  xlab("Decade") +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  scale_fill_viridis_d() +
  scale_color_viridis_d()

# Wave volume over time, economic vs non-economic events, log10 y-axis.
crisis.type.volume.trajectory <- ggplot(wavedata.vol50, aes(size = volume, x = YEAR, y = volume, color = ECO, fill = ECO, group = ECO)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_point() +
  geom_smooth() +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  scale_color_viridis_d(option = "inferno", begin = .2, end = .8) +
  scale_fill_viridis_d(option = "inferno", begin = .2, end = .8) +
  theme(legend.position = "bottom") +
  guides(size = "none") +
  scale_y_log10() +
  annotation_logticks()

# Wave duration over time, same comparison (linear y-axis).
crisis.type.duration.trajectory <- ggplot(wavedata.vol50, aes(size = duration, x = YEAR, y = duration, color = ECO, fill = ECO, group = ECO)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_point(alpha = .3) +
  geom_smooth() +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  scale_color_viridis_d(option = "inferno", begin = .2, end = .8) +
  scale_fill_viridis_d(option = "inferno", begin = .2, end = .8) +
  theme(legend.position = "bottom") +
  guides(size = "none")


# Yearly counts of economic vs non-economic crisis events.
eco.CE.year <- data.frame(table(wavedata.vol50$YEAR, wavedata.vol50$ECO))

# table() stacks the two ECO levels, so the first half of the rows holds the
# "Economic Crisis Event" counts and the second half the non-economic ones.
# Deriving the split point from nrow() replaces the hard-coded 1:172 /
# 173:344 indices, which silently broke whenever the number of distinct
# years changed.
half <- nrow(eco.CE.year) / 2
eco.CE.year2 <- data.frame(
  year = eco.CE.year[seq_len(half), 1],
  ECO = eco.CE.year[seq_len(half), 3],
  NonECO = eco.CE.year[half + seq_len(half), 3]
)

# Half-century reference years used by the vertical marker lines throughout
# this script. NOTE(review): defined here but referenced by many earlier
# plots — it must already be in the session when those run; consider moving
# this definition to the top of the script.
fifty <- c(1800, 1850, 1900, 1950, 2000)

# Do economic crises spill over into non-economic crisis coverage? Yearly
# non-economic event count, with point size/colour encoding the economic
# event count of the same year.
crisis.spillover <- ggplot(eco.CE.year2, aes(size = ECO, x = as.numeric(as.character(year)), y = NonECO, color = ECO, fill = ECO)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_point() +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  scale_color_viridis(option = "inferno", begin = .2, end = .8) +
  scale_fill_viridis(option = "inferno", begin = .2, end = .8) +
  theme(legend.position = c(0.15, 0.7)) +
  guides(size = guide_legend("Count of\nEconomic\nCrisis Events"), color = guide_legend("Count of\nEconomic\nCrisis Events"), fill = guide_legend("Count of\nEconomic\nCrisis Events")) +
  xlab("Year") +
  ylab("Count of Non-Economic Crisis Events")

# Total crisis-event coverage (article volume) per year.
CE.volume <- aggregate(wavedata.vol50$volume, by = list(wavedata.vol50$YEAR), FUN = "sum")

# Attach event coverage to the yearly news-volume series; years without any
# matched crisis event get 0 instead of NA.
newsvolume$CE.coverage <- CE.volume[match(newsvolume$year, CE.volume$Group.1), "x"]
newsvolume$CE.coverage <- replace(newsvolume$CE.coverage, is.na(newsvolume$CE.coverage), 0)
newsvolume$crisis <- replace(newsvolume$crisis, is.na(newsvolume$crisis), 0)

# Share of crisis-labelling coverage attributable to identified crisis
# events. Years with crisis == 0 would produce Inf or NaN (0/0) after the
# NA -> 0 replacement above; mark those as missing so geom_smooth() treats
# them as absent rather than as extreme values.
newsvolume$CEtoCL <- newsvolume$CE.coverage / newsvolume$crisis
newsvolume$CEtoCL[!is.finite(newsvolume$CEtoCL)] <- NA

CE_to_CL_trajectory <- ggplot(newsvolume, aes(x = year, y = 100 * CEtoCL)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_smooth(color = "deepskyblue4", fill = "deepskyblue4") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10), minor_breaks = seq(1780, 2020, 2)) +
  scale_y_continuous(breaks = seq(0, 100, 20), minor_breaks = seq(0, 100, 10), limits = c(0, 100)) +
  ylab("Share of crisis labelling coverage\nthat is linked to a crisis event")

# Quick interactive diagnostics (printed, not saved): crisis-event coverage
# and crisis labelling as percentage shares of all articles per year, and
# their ratio.
ggplot(newsvolume, aes(x = year, y = 100 * CE.coverage / articles)) +
  geom_point() +
  geom_smooth()
ggplot(newsvolume, aes(x = year, y = 100 * crisis / articles)) +
  geom_point() +
  geom_smooth()
# Event coverage relative to labelling coverage (can exceed 100 or be
# non-finite where crisis == 0; ggplot drops such points with a warning).
ggplot(newsvolume, aes(x = year, y = 100 * CE.coverage / crisis)) +
  geom_point() +
  geom_smooth()

### What share of crisis labelling can be assigned to specific crisis events?
ggsave(CE_to_CL_trajectory, file = "CE_to_CL_trajectory.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### Trajectory of crisis labelling over time.
### NOTE(review): "CL_trejectory.svg" is misspelled but kept verbatim —
### downstream material may reference the existing file name.
ggsave(crisis.labelling, file = "CL_trejectory.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### Distribution of topics in crisis labelling (cross-section).
ggsave(economic.topics, file = "eco_CL_share_cs.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the salience of economic topics with crisis labelling developed? Shares.
ggsave(economic.topics.time.share, file = "eco_CL_share.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the salience of economic topics with crisis labelling developed? Absolute numbers.
ggsave(economic.topics.time.abs, file = "eco_CL_abs.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the distribution of economic topics with crisis labelling developed? Shares.
ggsave(withineconomic.topics.time.share, file = "within_eco_CL_share.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the distribution of economic topics with crisis labelling developed? Absolute numbers.
ggsave(withineconomic.topics.time.abs, file = "within_eco_CL_abs.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the count of economic crisis events developed?
ggsave(eco.CE.time, file = "economic_CE_trajectory.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the count of economic (vs non-economic) crisis events developed? Indexed.
ggsave(comp.CE.time.Index, file = "economic_vs_noneconomic_CE_trajectory_index.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the count of economic (vs non-economic) crisis events developed? Absolute.
ggsave(comp.CE.time.abs, file = "economic_vs_noneconomic_CE_trajectory_abs.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the share of economic (vs non-economic) crisis events developed?
ggsave(comp.CE.time.col, file = "economic_vs_noneconomic_CE_trajectory.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the composition of economic crisis event topics developed?
ggsave(economic.crisis.types.decades, file = "within_economic_topics_trajectory.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the volume of economic and non-economic crisis events developed?
ggsave(crisis.type.volume.trajectory, file = "crisis_type_volume_trajectory.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the duration of economic and non-economic crisis events developed?
ggsave(crisis.type.duration.trajectory, file = "crisis_type_duration_trajectory.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### Do economic and non-economic crises co-occur more frequently today?
ggsave(crisis.spillover, file = "crisis_cooccurrence.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

# Epidemic-related waves (epiq, prepared elsewhere in the script) by year;
# interactive check, not saved.
ggplot(epiq, aes(x = YEAR, y = volume, size = volume)) +
  geom_point() +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10))


# Topic diversity of coverage per decade, measured as entropy over topic
# shares and standardized by the maximum attainable entropy for 120 equally
# likely topics (= log(120)).
max.h <- entropy(rep(1 / 120, times = 120), method = "ML")
decades <- seq(1780, 2010, 10)
h <- data.frame(decades = decades, raw.entropy = NA, std.entropy = NA, max.entropy = max.h)

# Per-decade article totals, aligned row by row. ave() returns one value per
# row matched by group; the original tapply() assignment returned one value
# per decade and relied on silent recycling, which misaligns totals unless
# the rows happen to be ordered with decade varying fastest.
decade.STM$total <- ave(decade.STM$article, decade.STM$decade, FUN = function(v) sum(v, na.rm = TRUE))
decade.STM$share <- decade.STM$article / decade.STM$total

# Raw and standardized entropy of the topic-share distribution per decade.
for (i in seq_along(decades))
{
  h[i, "raw.entropy"] <- entropy(subset(decade.STM, decade == decades[i])$share, method = "ML")
  h[i, "std.entropy"] <- h[i, "raw.entropy"] / h[i, "max.entropy"]
}

# Standardized topic entropy per decade with linear (green) and loess
# (dashed magenta) trends; the red line at 1 marks maximum entropy.
# NOTE(review): the filter `decades != 2030` is vacuous (decades run
# 1780-2010) and theme_light() is overridden by theme_bluewhite() later in
# the chain — both kept verbatim.
entropy_coverage <- ggplot(subset(h, decades != 2030), aes(x = decades, y = std.entropy)) +
  geom_vline(xintercept = fifty, size = 1.15, color = "lightblue3") +
  geom_point() +
  geom_smooth(method = "lm", fill = "#44aa66", color = "#44aa66") +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77", linetype = "longdash") +
  geom_hline(yintercept = 1, color = "red") +
  theme_light() +
  ylim(0.7, 1) +
  ylab("Standardized Entropy") +
  xlab("Decade") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10))

ggsave(entropy_coverage, file = "topicentropy_decade.svg", units = "cm", width = 16, height = 8, dpi = 1200, scale = 1.35)


## One data frame per crisis domain, subset from the volume-50 wave data.
## `%in%` over a value set is equivalent here to the original chained `==`
## comparisons: subset() drops both FALSE and NA condition rows.
CE.Economy <- subset(wavedata.vol50, AREA %in% c("Economy", "Energy"))
CE.Disaster <- subset(wavedata.vol50, AREA %in% "Disaster")
CE.Epidemic <- subset(wavedata.vol50, AREA %in% c("Epidemic", "Epidemics", "Health"))
CE.Functional <- subset(wavedata.vol50, AREA %in% "Functional")
CE.Geopolitical <- subset(wavedata.vol50, AREA %in% "Geopolitical")
CE.Government <- subset(wavedata.vol50, AREA %in% "Government")
CE.Justice <- subset(wavedata.vol50, AREA %in% "Justice")
CE.Military <- subset(wavedata.vol50, AREA %in% "Military")
CE.Transport <- subset(wavedata.vol50, AREA %in% "Transport")


# Epidemic waves with substantial coverage (volume > 50). The year column of
# wavedata.vol50 (and hence CE.Epidemic) is YEAR throughout this script; the
# original mapped x = year, which is not a column of this data frame.
ggplot(subset(CE.Epidemic, volume > 50), aes(y = volume, x = YEAR, size = volume)) +
  geom_point(fill = "white", shape = 1) +
  scale_y_log10()

# Wordlists that actually occur among the epidemic events.
table(CE.Epidemic$wordlist)[table(CE.Epidemic$wordlist) > 0]



# Persist the wave-level result objects for reuse in later sessions.
save(wavedata, file = "wavedata.RData")
save(wavedata.vol50, file = "wavedata_vol50.RData")
save(wave.keywords, file = "wave_keywords.RData")
save(tss, file = "tss_total.RData")
save(wd5, file = "wd5.RData")

# Export the two top exemplar articles of every crisis news wave as plain
# text (id, score, wordlist, topic, area, then newspaper metadata and full
# text), and collect the same strings for the bulk dumps below.

# Assemble the export text for exemplar `idx` (1 or 2) of wave `w`.
# NOTE(review): assumes wave.keywords[[w]]$texts column 1 holds the article
# id and column 2 a score — confirm against the upstream construction.
format.event.text <- function(w, idx, text.to.output) {
  id <- text.to.output[idx, 1]
  paste(
    id, "\n",
    text.to.output[idx, 2], "\n",
    paste(wave.keywords[[w]]$wordlist, collapse = " "), "\n",
    wave.keywords[[w]]$topic, "\n",
    wave.keywords[[w]]$area, "\n",
    textfiles.1$newspaper[which(textfiles.1$id == id)], "\n",
    textfiles.1$date[which(textfiles.1$id == id)], "\n",
    textfiles.1$headline[which(textfiles.1$id == id)], "\n",
    textfiles.1$text[which(textfiles.1$id == id)]
  )
}

# Preallocate (the original grew NA-initialised vectors element by element).
textdirectory1 <- character(W)
textdirectory2 <- character(W)

for (w in seq_len(W))
{
  text.to.output <- wave.keywords[[w]]$texts[1:2, ]
  text1 <- format.event.text(w, 1, text.to.output)
  text2 <- format.event.text(w, 2, text.to.output)

  filename1 <- paste0("et", w, "---1", ".txt")
  # The original called save.csv(), which does not exist in base R and would
  # abort the loop on its first iteration; writeLines() writes the assembled
  # text as intended. (The original also built a zero-padded "et....-2.txt"
  # name for text2 but never wrote it; that dead code is dropped.)
  writeLines(text1, con = filename1)

  textdirectory1[w] <- text1
  textdirectory2[w] <- text2
  print(paste0(w, "/", W))
  flush.console()
}


## Dump the collected exemplar texts to disk, one wave per element.
## writeLines() with a file path opens, writes, and closes the connection
## itself — equivalent to the explicit file()/writeLines()/close() triple.
writeLines(textdirectory1, con = "EventText1.txt")
writeLines(textdirectory2, con = "EventText2.txt")


# Combined topic-diversity table across three corpora (routine coverage,
# crisis labelling, crisis news waves) and two indicators (share of active
# topics; diversity = 1 - Gini). Seeded with a single all-NA row, which is
# dropped below when top_div_dat filters on !is.na(value).
topic_diversity_data <- data.frame(year = NA, indicator = NA, categories = NA, corpus = NA, value = NA)

# Long format of the active-topic counts at the three resolutions.
thetimes_L <- pivot_longer(thetimes3, cols = c("active250", "active50", "active20"), names_to = "categories", values_to = "x")

# NOTE(review): rep(..., each = 236) hard-codes 236 yearly rows in thetimes
# (presumably 1785-2020) — revisit if the series changes.
topic_diversity_data <- rbind(
  topic_diversity_data,
  data.frame(year = thetimes_L$year, indicator = "active", categories = thetimes_L$categories, corpus = "crisis labelling", value = thetimes_L$x),
  data.frame(year = thetimes.d3$decade, indicator = "active", categories = thetimes.d3$name, corpus = "crisis news waves", value = thetimes.d3$value),
  data.frame(year = df.active_L$year, indicator = "active", categories = df.active_L$resolution, corpus = "routine", value = df.active_L$active),
  data.frame(year = thetimes3$year, indicator = "gini", categories = thetimes3$var, corpus = "crisis labelling", value = 1 - thetimes3$value),
  data.frame(year = rep(thetimes$year, times = 3), indicator = "gini", categories = rep(c("21 topic areas", "163 topic complexes", "250 topics"), each = 236), corpus = "crisis news waves", value = c(thetimes$nw.rgini20r, thetimes$nw.rgini50r, thetimes$nw.rgini250r)),
  data.frame(year = str_extract(df.gini_L$year, pattern = "[:digit:]{4,4}"), indicator = "gini", categories = df.gini_L$name, corpus = "routine", value = df.gini_L$value)
)


# Ordered factors so legends/facets run routine -> labelling -> waves and
# fine -> coarse topic resolution; Recode (car) harmonises the many
# category spellings used by the source tables.
topic_diversity_data$corp <- factor(topic_diversity_data$corpus, ordered = TRUE, levels = c("routine", "crisis labelling", "crisis news waves"))

topic_diversity_data$cat <- factor(Recode(topic_diversity_data$categories, "'active20'='21 topic areas';'active50'='163 topic complexes';'active250'='250 topics';'nw20d'='21 topic areas';'nw50d'='163 topic complexes';'nw250d'='250 topics';'topic areas(20)'='21 topic areas';'topic complexes(50)'='163 topic complexes';'topic(250)'='250 topics';'topic20'='21 topic areas';'topic50'='163 topic complexes';'topic250'='250 topics'"), ordered = TRUE, levels = c("250 topics", "163 topic complexes", "21 topic areas"))

# Plotting subset: rows with a usable value (also removes the NA seed row).
top_div_dat <- subset(topic_diversity_data, !is.na(value))

# Interactive checks of both indicators at the two coarser resolutions.
ggplot(subset(top_div_dat, cat == "21 topic areas"), aes(y = value, x = as.Date(year, format = "%Y"), linetype = corpus, shape = corpus, color = indicator)) +
  geom_point() +
  geom_smooth()

ggplot(subset(top_div_dat, cat == "163 topic complexes"), aes(y = value, x = as.Date(year, format = "%Y"), linetype = corpus, shape = corpus, color = indicator)) +
  geom_point() +
  geom_smooth()

# Publication figure: indicator x topic resolution grid, one line per corpus.
gg_topic_diversity <- ggplot(top_div_dat, aes(y = value, x = as.Date(year, format = "%Y"), linetype = corp, shape = corp, color = corp)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  theme_bluewhite() +
  scale_fill_viridis_d(option = "inferno", begin = 0, end = 0.85) +
  scale_color_viridis_d(option = "inferno", begin = 0, end = 0.85) +
  facet_grid(cat ~ indicator) +
  xlab("Year") +
  ylab("Share of active topics Diversity (1-Gini)") +
  theme(legend.position = "bottom")

ggsave(gg_topic_diversity, file = "gg_topic_diversity.svg", scale = 1.25, dpi = 1200, unit = "cm", width = 16, height = 16)

# Merge external yearly indicator series into thetimes by year.
thetimes$statistics <- social_statistics_timeline[match(thetimes$year, social_statistics_timeline$year), "indicators"]

thetimes$newspaper_circulation <- media_indicators[match(thetimes$year, media_indicators$year), "newspaper_circulation_i"]
thetimes$TV_households_share <- media_indicators[match(thetimes$year, media_indicators$year), "TV_households_share_i"]
thetimes$Internet_penetration <- media_indicators[match(thetimes$year, media_indicators$year), "Internet_penetration_i"]

# Share of respondents without party identification as a media-autonomy
# proxy; gaps filled by last observation carried forward (na_locf, imputeTS).
thetimes$media_autonomy <- df_bes_year[match(thetimes$year, df_bes_year$year), "no_party_id"]

thetimes$media_autonomy_i <- na_locf(thetimes$media_autonomy)

# Prewhitening: ARIMA(2,1,2) residuals of each series, so the regressions
# below relate innovations rather than shared long-run trends.
thetimes$Newspaper_212 <- arima(thetimes$newspaper_circulation, order = c(2, 1, 2))$resid
thetimes$TV_212 <- arima(thetimes$TV_households_share, order = c(2, 1, 2))$resid
thetimes$Internet_212 <- arima(thetimes$Internet_penetration, order = c(2, 1, 2))$resid
thetimes$ORG_212 <- arima(thetimes$ccORG_per_article, order = c(2, 1, 2))$resid
thetimes$PERSON_212 <- arima(thetimes$ccPERSON_per_article, order = c(2, 1, 2))$resid
thetimes$SpendingShare_212 <- arima(thetimes$spending_GDP, order = c(2, 1, 2))$resid
thetimes$SpendingDiversity_212 <- arima(thetimes$gini_rev, order = c(2, 1, 2))$resid
thetimes$Autonomy_212 <- arima(thetimes$media_autonomy_i, order = c(2, 1, 2))$resid
thetimes$Statistics_212 <- arima(thetimes$statistics, order = c(2, 1, 2))$resid
thetimes$CLSHARE_212 <- arima(thetimes$cl.share, order = c(2, 1, 2))$resid

# Overall media penetration = per-year maximum of the three channel-reach
# series (rowMaxs, matrixStats), then prewhitened like the rest.
thetimes$Penetration <- rowMaxs(as.matrix(thetimes[, c("newspaper_circulation", "TV_households_share", "Internet_penetration")]))
thetimes$Penetration_212 <- arima(thetimes$Penetration, order = c(2, 1, 2))$resid

# Binary autonomy indicator. NOTE(review): the 0.06 cutoff is not documented
# here — confirm against the paper before reuse.
thetimes$Autonomy_bi <- 1 * (thetimes$media_autonomy_i > 0.06)


## Nested OLS models of prewhitened crisis-labelling salience (CLSHARE_212)
## on the ARIMA(2,1,2)-prewhitened predictors, ordered from the empty
## baseline to the full specification. Formulas and object names are
## unchanged; only redundant wrapping parentheses were dropped and the
## independent fits reordered.

# Baseline: intercept only.
CLS_m0 <- lm(CLSHARE_212 ~ 1, data = thetimes)

# State activity: spending level and spending diversity.
CLS_m1 <- lm(CLSHARE_212 ~ SpendingShare_212 + SpendingDiversity_212, data = thetimes)

# Media reach interacted with media autonomy.
CLS_m2 <- lm(CLSHARE_212 ~ (Penetration_212) * Autonomy_212, data = thetimes)

# Societal self-observation: organisations and persons per article.
CLS_m2b <- lm(CLSHARE_212 ~ ORG_212 + PERSON_212, data = thetimes)

# Availability of social statistics.
CLS_m2c <- lm(CLSHARE_212 ~ Statistics_212, data = thetimes)

# Media interaction plus state activity.
CLS_m3 <- lm(CLSHARE_212 ~ Penetration_212 * Autonomy_212 + SpendingShare_212 + SpendingDiversity_212, data = thetimes)

# ... plus statistics.
CLS_m4 <- lm(CLSHARE_212 ~ Penetration_212 * Autonomy_212 + SpendingShare_212 + SpendingDiversity_212 + Statistics_212, data = thetimes)

# ... plus the self-observation terms.
CLS_m5 <- lm(CLSHARE_212 ~ Penetration_212 * Autonomy_212 + ORG_212 + PERSON_212 + SpendingShare_212 + SpendingDiversity_212 + Statistics_212, data = thetimes)

# All controls without the media interaction.
CLS_m5b <- lm(CLSHARE_212 ~ +ORG_212 + PERSON_212 + SpendingShare_212 + SpendingDiversity_212 + Statistics_212, data = thetimes)

# Full specification including the raw Internet innovation series.
CLS_m6 <- lm(CLSHARE_212 ~ Penetration_212 * Autonomy_212 + ORG_212 + PERSON_212 + SpendingShare_212 + SpendingDiversity_212 + Statistics_212 + Internet_212, data = thetimes)

# LOCF-imputed spending diversity for the level models further below.
thetimes$gini_rev_i <- na_locf(thetimes$gini_rev)

# Regression models of crisis-labelling salience in levels: ARIMA(2,1,2)
# regressions with exogenous regressors (forecast::Arima), and OLS models
# estimated on successively truncated eras to test out-of-era prediction.
arima_CL1 <- Arima(thetimes$cl.share, xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev")]), order = c(2, 1, 2), include.drift = TRUE)

arima_CL2 <- Arima(thetimes$cl.share, xreg = as.matrix(thetimes[, c("Penetration_212", "Autonomy_212", "Statistics_212", "ORG_212", "PERSON_212", "SpendingShare_212", "SpendingDiversity_212")]), order = c(2, 1, 2), include.drift = TRUE)

# Full sample with a linear year trend.
lm_CL1 <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + ccORG_per_article + ccPERSON_per_article + spending_GDP + gini_rev_i + year, data = thetimes)

lm_CL2 <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + ccORG_per_article + ccPERSON_per_article + spending_GDP + gini_rev_i, data = subset(thetimes, year < 2000))

lm_CL3 <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + ccORG_per_article + ccPERSON_per_article + spending_GDP + gini_rev_i, data = subset(thetimes, year > 1900))

lm_CL4 <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + ccORG_per_article + ccPERSON_per_article + spending_GDP + gini_rev_i, data = subset(thetimes, year < 1900))

lm_CL5 <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + ccORG_per_article + ccPERSON_per_article + spending_GDP + gini_rev_i, data = subset(thetimes, year < 1950))

lm_CL6 <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + ccORG_per_article + ccPERSON_per_article + spending_GDP + gini_rev_i, data = subset(thetimes, year < 1975))

lm_CL7 <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + ccORG_per_article + ccPERSON_per_article + spending_GDP + gini_rev_i, data = subset(thetimes, year < 1990))

lm_CL8 <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + ccORG_per_article + ccPERSON_per_article + spending_GDP + gini_rev_i, data = thetimes)

# Prediction trained on the pre-2000 data (forecast for 2000-2020)
thetimes$pred_1 <- stats::predict(lm_CL2, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])

# Prediction trained on the post-1900 data (forecast for 1785-1900)
thetimes$pred_2 <- stats::predict(lm_CL3, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])

# Prediction trained on the pre-1900 data (forecast for 1900-2020)
thetimes$pred_3 <- stats::predict(lm_CL4, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])

# Prediction trained on the pre-1950 data (forecast for 1950-2020)
thetimes$pred_4 <- stats::predict(lm_CL5, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])

# Prediction trained on the pre-1975 data (forecast for 1975-2020)
thetimes$pred_5 <- stats::predict(lm_CL6, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])

# Prediction trained on the pre-1990 data (forecast for 1990-2020)
thetimes$pred_6 <- stats::predict(lm_CL7, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])

# Prediction trained on the full data (no forecast)
thetimes$pred_7 <- stats::predict(lm_CL8, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])



# Interactive checks: observed (black squares) vs predicted (red diamonds).
ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_1))

ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_2))

ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_3))






# Same era-truncated level models as lm_CL1..lm_CL8 but WITHOUT the
# self-observation controls (ccORG/ccPERSON); suffix X. Kept byte-identical:
# stargazer output below depends on the stored model calls.
lm_CL1X <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i + year, data = thetimes)

lm_CL2X <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i, data = subset(thetimes, year < 2000))

lm_CL3X <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i, data = subset(thetimes, year > 1900))

lm_CL4X <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i, data = subset(thetimes, year < 1900))

lm_CL5X <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i, data = subset(thetimes, year < 1950))

lm_CL6X <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i, data = subset(thetimes, year < 1975))

lm_CL7X <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i, data = subset(thetimes, year < 1990))

lm_CL8X <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i, data = thetimes)


# Out-of-era predictions from the reduced (X) models, applied to all years.
# Prediction trained on the pre-2000 data (forecast for 2000-2020)
thetimes$pred_1X <- stats::predict(lm_CL2X, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])

# Prediction trained on the post-1900 data (forecast for 1785-1900)
thetimes$pred_2X <- stats::predict(lm_CL3X, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])

# Prediction trained on the pre-1900 data (forecast for 1900-2020)
thetimes$pred_3X <- stats::predict(lm_CL4X, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])

# Prediction trained on the pre-1950 data (forecast for 1950-2020)
thetimes$pred_4X <- stats::predict(lm_CL5X, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])

# Prediction trained on the pre-1975 data (forecast for 1975-2020)
thetimes$pred_5X <- stats::predict(lm_CL6X, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])

# Prediction trained on the pre-1990 data (forecast for 1990-2020)
thetimes$pred_6X <- stats::predict(lm_CL7X, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])

# Prediction trained on the full data (no forecast)
thetimes$pred_7X <- stats::predict(lm_CL8X, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])


# Interactive checks: observed (black squares) vs predicted (red diamonds).
ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_1X))

ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_2X))

ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_3X))

ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_4X))

ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_5X))

# Named calibration panels: observed crisis-labelling share (black squares)
# vs prediction (red diamonds). Shaded band: dark red = training window,
# dark blue = forecast window; band height 0.01 except the 1975 split (0.20).
gg_1785_1990 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_6X)) +
  annotate("rect", xmin = 1990, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 1990, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

# NOTE(review): the chained assignment also writes gg_1785_1975 here, which
# is redefined further below — presumably a copy-paste remnant; confirm
# which version is intended before reuse.
gg_1785_2000 <- gg_1785_1975 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_1X)) +
  annotate("rect", xmin = 2000, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 2000, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

gg_1900_2020 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_2X)) +
  annotate("rect", xmin = 1785, xmax = 1900, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1900, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

gg_1785_1900 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_3X)) +
  annotate("rect", xmin = 1900, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 1900, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

gg_1785_1950 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_4X)) +
  annotate("rect", xmin = 1950, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 1950, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

gg_1785_1975 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_5X)) +
  annotate("rect", xmin = 1975, xmax = 2020, ymin = 0, ymax = 0.20, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 1975, ymin = 0, ymax = 0.20, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

gg_1785_2020 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_7X)) +
  annotate("rect", xmin = 1785, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")


# ggX_* duplicates of the panels above (same predictions and layout).
# NOTE(review): these plot the same pred_*X series as the gg_* versions —
# possibly intended for the "with ORG" pred_* series instead; confirm.
ggX_1785_1990 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_6X)) +
  annotate("rect", xmin = 1990, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 1990, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

# NOTE(review): as above, the chained assignment overwrites gg_1785_1975 —
# likely a copy-paste remnant.
ggX_1785_2000 <- gg_1785_1975 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_1X)) +
  annotate("rect", xmin = 2000, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 2000, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

ggX_1900_2020 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_2X)) +
  annotate("rect", xmin = 1785, xmax = 1900, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1900, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

ggX_1785_1900 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_3X)) +
  annotate("rect", xmin = 1900, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 1900, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

ggX_1785_1950 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_4X)) +
  annotate("rect", xmin = 1950, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 1950, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

ggX_1785_1975 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_5X)) +
  annotate("rect", xmin = 1975, xmax = 2020, ymin = 0, ymax = 0.20, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 1975, ymin = 0, ymax = 0.20, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

ggX_1785_2020 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_7X)) +
  annotate("rect", xmin = 1785, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

# Long format of all 14 prediction series for the faceted calibration plot.
prediction_long <- pivot_longer(thetimes, col = c("pred_1", "pred_2", "pred_3", "pred_4", "pred_5", "pred_6", "pred_7", "pred_1X", "pred_2X", "pred_3X", "pred_4X", "pred_5X", "pred_6X", "pred_7X"))

# Training/prediction window metadata per series (car::Recode lookup tables).
prediction_long$train_start <- car::Recode(prediction_long$name, "'pred_1'=1785;'pred_1X'=1785;'pred_2'=1900;'pred_2X'=1900;'pred_3'=1785;'pred_3X'=1785;'pred_4'=1785;'pred_4X'=1785;'pred_5'=1785;'pred_5X'=1785;'pred_6'=1785;'pred_6X'=1785;'pred_7'=1785;'pred_7X'=1785")

prediction_long$train_end <- car::Recode(prediction_long$name, "'pred_1'=2000;'pred_1X'=2000;'pred_2'=2020;'pred_2X'=2020;'pred_3'=1900;'pred_3X'=1900;'pred_4'=1950;'pred_4X'=1950;'pred_5'=1975;'pred_5X'=1975;'pred_6'=1990;'pred_6X'=1990;'pred_7'=2020;'pred_7X'=2020")

# Masked to the year-2020 row so geom_rect draws each rectangle once per
# facet instead of once per year (the alpha would otherwise stack 236x).
prediction_long$train_end <- ifelse(prediction_long$year == 2020, prediction_long$train_end, NA)

prediction_long$predict_start <- car::Recode(prediction_long$name, "'pred_1'=2000;'pred_1X'=2000;'pred_2'=1785;'pred_2X'=1785;'pred_3'=1900;'pred_3X'=1900;'pred_4'=1950;'pred_4X'=1950;'pred_5'=1975;'pred_5X'=1975;'pred_6'=1990;'pred_6X'=1990;'pred_7'=2020;'pred_7X'=2020")

prediction_long$predict_end <- car::Recode(prediction_long$name, "'pred_1'=2020;'pred_1X'=2020;'pred_2'=1900;'pred_2X'=1900;'pred_3'=2020;'pred_3X'=2020;'pred_4'=2020;'pred_4X'=2020;'pred_5'=2020;'pred_5X'=2020;'pred_6'=2020;'pred_6X'=2020;'pred_7'=2020;'pred_7X'=2020")

# Same single-row masking as for train_end.
prediction_long$predict_end <- ifelse(prediction_long$year == 2020, prediction_long$predict_end, NA)

# Facet labels: training window and model set (with/without ORG controls).
prediction_long$train <- car::Recode(prediction_long$name, "'pred_1'='1785-2000';'pred_1X'='1785-2000';'pred_2'='1900-2020';'pred_2X'='1900-2020';'pred_3'='1785-1900';'pred_3X'='1785-1900';'pred_4'='1785-1950';'pred_4X'='1785-1950';'pred_5'='1785-1975';'pred_5X'='1785-1975';'pred_6'='1785-1990';'pred_6X'='1785-1990';'pred_7'='1785-2020';'pred_7X'='1785-2020'")

prediction_long$set <- car::Recode(prediction_long$name, "'pred_1'='with_ORG';'pred_1X'='without_ORG';'pred_2'='with_ORG';'pred_2X'='without_ORG';'pred_3'='with_ORG';'pred_3X'='without_ORG';'pred_4'='with_ORG';'pred_4X'='without_ORG';'pred_5'='with_ORG';'pred_5X'='without_ORG';'pred_6'='with_ORG';'pred_6X'='without_ORG';'pred_7'='with_ORG';'pred_7X'='without_ORG'")

# Per-panel shading height (matches each panel's y-range; 0.2 for the
# 1975 split, 0.01 otherwise).
prediction_long$range <- car::Recode(prediction_long$name, "'pred_1'=0.01;'pred_1X'=0.01;'pred_2'=0.01;'pred_2X'=0.01;'pred_3'=0.01;'pred_3X'=0.01;'pred_4'=0.01;'pred_4X'=0.01;'pred_5'=0.2;'pred_5X'=0.2;'pred_6'=0.01;'pred_6X'=0.01;'pred_7'=0.01;'pred_7X'=0.01")

# Interactive check of the 1975-split panel before building the facet grid.
ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_5X)) +
  annotate("rect", xmin = 1975, xmax = 2020, ymin = 0, ymax = 0.20, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 1975, ymin = 0, ymax = 0.20, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

# Faceted calibration figure: training window (dark blue rect) vs forecast
# window (dark red rect), with/without ORG controls side by side.
# NOTE(review): this object is overwritten below with alpha .25 instead of
# .2 before it is saved — only the second definition reaches ggsave().
gg_calibration <- ggplot(prediction_long, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = value)) +
  geom_rect(aes(xmin = train_start, xmax = train_end, ymax = range), ymin = 0, fill = "darkblue", alpha = .2) +
  geom_rect(aes(xmin = predict_start, xmax = predict_end, ymax = range), ymin = 0, fill = "darkred", alpha = .2) +
  facet_grid(train ~ set, scale = "free_y") +
  theme_soft() +
  ylab("Crisis Labelling Salience")

# Single-series check; alpha divided by 236 to counteract one rect being
# drawn per yearly row in this mapped (non-masked) variant.
ggplot(subset(prediction_long, name == "pred_1"), aes(x = year)) +
  geom_rect(aes(xmin = train_start, xmax = train_end), ymin = 0, ymax = 0.025, fill = "darkblue", alpha = .25 / 236) +
  geom_rect(aes(xmin = predict_start, xmax = predict_end), ymin = 0, ymax = 0.025, fill = "darkred", alpha = .25 / 236) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = value)) +
  facet_grid(train ~ set, scale = "free_y") +
  theme_soft() +
  ylab("Crisis Labelling Salience")

# Final version (alpha .25); this is the object that gets saved.
gg_calibration <- ggplot(prediction_long, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = value)) +
  geom_rect(aes(xmin = train_start, xmax = train_end, ymax = range), ymin = 0, fill = "darkblue", alpha = .25) +
  geom_rect(aes(xmin = predict_start, xmax = predict_end, ymax = range), ymin = 0, fill = "darkred", alpha = .25) +
  facet_grid(train ~ set, scale = "free_y") +
  theme_soft() +
  ylab("Crisis Labelling Salience")

ggsave(file = "gg_calibration.svg", device = "svg", gg_calibration, unit = "cm", width = 12, height = 16, scale = 1.25, dpi = 1200)

# Text regression tables for the two model families (plain and X-suffixed
# variants); column.labels give each model's training window, in the same
# order as the model arguments.
stargazer(lm_CL1, lm_CL2, lm_CL7, lm_CL6, lm_CL5, lm_CL4, lm_CL3, type = "text", column.labels = c("1785-2020", "1785-2000", "1785-1990", "1785-1975", "1785-1950", "1785-1900", "1900-2020"))
stargazer(lm_CL1X, lm_CL2X, lm_CL7X, lm_CL6X, lm_CL5X, lm_CL4X, lm_CL3X, type = "text", column.labels = c("1785-2020", "1785-2000", "1785-1990", "1785-1975", "1785-1950", "1785-1900", "1900-2020"))

# Squared Pearson correlations (out-of-sample R^2) between each model's
# predictions (pred_k and its X-suffixed counterpart) and observed
# salience, evaluated on the indicated held-out year ranges.
with(subset(thetimes, year >= 2000), cor.test(pred_1, cl.share))$estimate^2
with(subset(thetimes, year >= 2000), cor.test(pred_1X, cl.share))$estimate^2

with(subset(thetimes, year <= 1900), cor.test(pred_2, cl.share))$estimate^2
with(subset(thetimes, year <= 1900), cor.test(pred_2X, cl.share))$estimate^2

with(subset(thetimes, year >= 1900), cor.test(pred_3, cl.share))$estimate^2
with(subset(thetimes, year >= 1900), cor.test(pred_3X, cl.share))$estimate^2

with(subset(thetimes, year >= 1950), cor.test(pred_4, cl.share))$estimate^2
with(subset(thetimes, year >= 1950), cor.test(pred_4X, cl.share))$estimate^2

with(subset(thetimes, year >= 1975), cor.test(pred_5, cl.share))$estimate^2
with(subset(thetimes, year >= 1975), cor.test(pred_5X, cl.share))$estimate^2

with(subset(thetimes, year >= 1990), cor.test(pred_6, cl.share))$estimate^2
with(subset(thetimes, year >= 1990), cor.test(pred_6X, cl.share))$estimate^2

# Full-sample series (no held-out period): full cor.test output is shown.
with(thetimes, cor.test(pred_7, cl.share))
with(thetimes, cor.test(pred_7X, cl.share))




grid.arrange(gg_1900_2020, gg_1785_1900, gg_1785_1950, gg_1785_1975, gg_1785_1990, gg_1785_2000, gg_1785_2020, ggX_1900_2020, ggX_1785_1900, ggX_1785_1950, ggX_1785_1975, ggX_1785_1990, ggX_1785_2000, ggX_1785_2020, ncol = 2, as.table = FALSE)



library(stargazer)

# Text table of the nested CLS_m* model series (fitted elsewhere in the file).
stargazer(CLS_m0, CLS_m1, CLS_m2b, CLS_m2, CLS_m3, CLS_m4, CLS_m5, CLS_m6, type = "text")



# Ad-hoc OLS check: salience regressed on media-reach, state-capacity and
# coverage-composition predictors plus a linear year trend.
summary(lm(cl.share ~ newspaper_circulation + TV_households_share + Internet_penetration + statistics + (ccPERSON_index) + (ccORG_index) + spending_GDP + gini_rev + year, data = thetimes))



# Regression-with-ARIMA(2,1,2)-errors (plus drift) of crisis-labelling
# salience on the full predictor set, fitted over different training
# windows.  The pre-1900/pre-1950 windows omit media_autonomy_i from the
# xreg matrix — presumably unavailable or constant in the early period;
# confirm against the data construction code.
arima_CL.1900_2020 <- Arima(subset(thetimes, year > 1900)[, "cl.share"], xreg = as.matrix(subset(thetimes, year > 1900)[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")]), order = c(2, 1, 2), include.drift = TRUE)

arima_CL.1785_1900 <- Arima(subset(thetimes, year < 1900)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 1900)[, c("Penetration", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")]), order = c(2, 1, 2), include.drift = TRUE)

arima_CL.1785_1950 <- Arima(subset(thetimes, year < 1950)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 1950)[, c("Penetration", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")]), order = c(2, 1, 2), include.drift = TRUE)

arima_CL.1785_1975 <- Arima(subset(thetimes, year < 1975)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 1975)[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")]), order = c(2, 1, 2), include.drift = TRUE)

arima_CL.1785_1990 <- Arima(subset(thetimes, year < 1990)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 1990)[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")]), order = c(2, 1, 2), include.drift = TRUE)

arima_CL.1785_2000 <- Arima(subset(thetimes, year < 2000)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 2000)[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")]), order = c(2, 1, 2), include.drift = TRUE)

arima_CL.1785_2020 <- Arima(subset(thetimes, year < 2021)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 2021)[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")]), order = c(2, 1, 2), include.drift = TRUE)

# Placebo regressors: pure white-noise columns used as "empty" xreg inputs
# for the e_arima_* baseline models below.
# FIX: seed the RNG so the placebo draws (and hence the baseline fits)
# are reproducible across runs, and derive n from the data instead of the
# hard-coded 236 so the block survives a change in the sample.
set.seed(1785)
n_years <- nrow(thetimes)  # 236 annual observations (1785-2020)

dummat <- matrix(rnorm(n = n_years, mean = 0, sd = 1))
colnames(dummat) <- "dummy"

thetimes$dummy <- rnorm(n = n_years, mean = 0, sd = 1)
thetimes$dummy2 <- rnorm(n = n_years, mean = 0, sd = 1)

# Placebo baselines: the same ARIMA(2,1,2)+drift specification, but with
# only the two white-noise dummies as regressors.  These quantify how much
# of the apparent "fit" comes from the time-series structure alone.
e_arima_CL.1900_2020 <- Arima(subset(thetimes, year > 1900)[, "cl.share"], xreg = as.matrix(subset(thetimes, year > 1900)[, c("dummy", "dummy2")]), order = c(2, 1, 2), include.drift = TRUE)

e_arima_CL.1785_1900 <- Arima(subset(thetimes, year < 1900)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 1900)[, c("dummy", "dummy2")]), order = c(2, 1, 2), include.drift = TRUE)

e_arima_CL.1785_1950 <- Arima(subset(thetimes, year < 1950)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 1950)[, c("dummy", "dummy2")]), order = c(2, 1, 2), include.drift = TRUE)

e_arima_CL.1785_1975 <- Arima(subset(thetimes, year < 1975)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 1975)[, c("dummy", "dummy2")]), order = c(2, 1, 2), include.drift = TRUE)

e_arima_CL.1785_1990 <- Arima(subset(thetimes, year < 1990)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 1990)[, c("dummy", "dummy2")]), order = c(2, 1, 2), include.drift = TRUE)

e_arima_CL.1785_2000 <- Arima(subset(thetimes, year < 2000)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 2000)[, c("dummy", "dummy2")]), order = c(2, 1, 2), include.drift = TRUE)

e_arima_CL.1785_2020 <- Arima(subset(thetimes, year < 2021)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 2021)[, c("dummy", "dummy2")]), order = c(2, 1, 2), include.drift = TRUE)

# Reduced-predictor variants: same ARIMA(2,1,2)+drift models but without
# the ccORG/ccPERSON coverage-composition regressors.
p_arima_CL.1900_2020 <- Arima(subset(thetimes, year > 1900)[, "cl.share"], xreg = as.matrix(subset(thetimes, year > 1900)[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")]), order = c(2, 1, 2), include.drift = TRUE)

p_arima_CL.1785_1900 <- Arima(subset(thetimes, year < 1900)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 1900)[, c("Penetration", "statistics", "spending_GDP", "gini_rev_i")]), order = c(2, 1, 2), include.drift = TRUE)

p_arima_CL.1785_1950 <- Arima(subset(thetimes, year < 1950)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 1950)[, c("Penetration", "statistics", "spending_GDP", "gini_rev_i")]), order = c(2, 1, 2), include.drift = TRUE)

p_arima_CL.1785_1975 <- Arima(subset(thetimes, year < 1975)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 1975)[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")]), order = c(2, 1, 2), include.drift = TRUE)

p_arima_CL.1785_1990 <- Arima(subset(thetimes, year < 1990)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 1990)[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")]), order = c(2, 1, 2), include.drift = TRUE)

p_arima_CL.1785_2000 <- Arima(subset(thetimes, year < 2000)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 2000)[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")]), order = c(2, 1, 2), include.drift = TRUE)

p_arima_CL.1785_2020 <- Arima(subset(thetimes, year < 2021)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 2021)[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")]), order = c(2, 1, 2), include.drift = TRUE)






# Let auto.arima() pick the (p,d,q) order for the full-sample model as a
# robustness check on the hand-chosen (2,1,2).
# NOTE(review): this call uses "gini_rev" while every Arima() call above
# uses "gini_rev_i" — confirm whether the raw or the interpolated Gini
# series is intended here.
aarima_CL.1785_2020 <- auto.arima(subset(thetimes, year < 2021)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 2021)[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev")]), include.drift = TRUE)

# Check for remaining ARMA structure in the OLS model's residuals.
auto.arima(lm_CL1$resid)

# Forecast each window-trained model over the full 1785-2020 xreg matrix
# and keep the result as a data frame (Point.Forecast plus intervals).
# NOTE(review): the second positional argument (thetimes[, "cl.share"])
# is matched to forecast()'s `h`, which appears to be ignored once xreg
# is supplied (h is taken from nrow(xreg)) — confirm and consider
# removing it for clarity.
df_fc_1785_1900 <- data.frame(forecast(arima_CL.1785_1900, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])))
df_fc_1785_1950 <- data.frame(forecast(arima_CL.1785_1950, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])))
df_fc_1785_1975 <- data.frame(forecast(arima_CL.1785_1975, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])))
df_fc_1785_1990 <- data.frame(forecast(arima_CL.1785_1990, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])))
df_fc_1785_2000 <- data.frame(forecast(arima_CL.1785_2000, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])))
df_fc_1785_2020 <- data.frame(forecast(arima_CL.1785_2020, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])))
df_fc_1900_2020 <- data.frame(forecast(arima_CL.1900_2020, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])))

# Same full-range forecasts for the reduced-predictor (no ORG/PERSON)
# model variants.
p.df_fc_1785_1900 <- data.frame(forecast(p_arima_CL.1785_1900, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "statistics", "spending_GDP", "gini_rev_i")])))
p.df_fc_1785_1950 <- data.frame(forecast(p_arima_CL.1785_1950, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "statistics", "spending_GDP", "gini_rev_i")])))
p.df_fc_1785_1975 <- data.frame(forecast(p_arima_CL.1785_1975, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])))
p.df_fc_1785_1990 <- data.frame(forecast(p_arima_CL.1785_1990, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])))
p.df_fc_1785_2000 <- data.frame(forecast(p_arima_CL.1785_2000, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])))
p.df_fc_1785_2020 <- data.frame(forecast(p_arima_CL.1785_2020, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])))
p.df_fc_1900_2020 <- data.frame(forecast(p_arima_CL.1900_2020, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])))


# Full-range forecasts for the placebo (white-noise regressor) baselines.
e.df_fc_1785_1900 <- data.frame(forecast(e_arima_CL.1785_1900, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("dummy", "dummy2")])))
e.df_fc_1785_1950 <- data.frame(forecast(e_arima_CL.1785_1950, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("dummy", "dummy2")])))
e.df_fc_1785_1975 <- data.frame(forecast(e_arima_CL.1785_1975, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("dummy", "dummy2")])))
e.df_fc_1785_1990 <- data.frame(forecast(e_arima_CL.1785_1990, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("dummy", "dummy2")])))
e.df_fc_1785_2000 <- data.frame(forecast(e_arima_CL.1785_2000, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("dummy", "dummy2")])))
e.df_fc_1785_2020 <- data.frame(forecast(e_arima_CL.1785_2020, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("dummy", "dummy2")])))
e.df_fc_1900_2020 <- data.frame(forecast(e_arima_CL.1900_2020, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("dummy", "dummy2")])))

# Attach each model's point forecasts to thetimes as columns, one per
# training window.  NOTE(review): values are matched to thetimes rows
# purely by position (each forecast has 236 rows, like thetimes) —
# verify that forecast step 1 corresponds to the intended year for every
# training window.

# Full-predictor models.
thetimes$afc.1785_1900 <- (df_fc_1785_1900$Point.Forecast)
thetimes$afc.1785_1950 <- (df_fc_1785_1950$Point.Forecast)
thetimes$afc.1785_1975 <- (df_fc_1785_1975$Point.Forecast)
thetimes$afc.1785_1990 <- (df_fc_1785_1990$Point.Forecast)
thetimes$afc.1785_2000 <- (df_fc_1785_2000$Point.Forecast)
thetimes$afc.1785_2020 <- (df_fc_1785_2020$Point.Forecast)
thetimes$afc.1900_2020 <- (df_fc_1900_2020$Point.Forecast)

# Placebo baselines.
thetimes$e_afc.1785_1900 <- (e.df_fc_1785_1900$Point.Forecast)
thetimes$e_afc.1785_1950 <- (e.df_fc_1785_1950$Point.Forecast)
thetimes$e_afc.1785_1975 <- (e.df_fc_1785_1975$Point.Forecast)
thetimes$e_afc.1785_1990 <- (e.df_fc_1785_1990$Point.Forecast)
thetimes$e_afc.1785_2000 <- (e.df_fc_1785_2000$Point.Forecast)
thetimes$e_afc.1785_2020 <- (e.df_fc_1785_2020$Point.Forecast)
thetimes$e_afc.1900_2020 <- (e.df_fc_1900_2020$Point.Forecast)

# Reduced-predictor models.
thetimes$p_afc.1785_1900 <- (p.df_fc_1785_1900$Point.Forecast)
thetimes$p_afc.1785_1950 <- (p.df_fc_1785_1950$Point.Forecast)
thetimes$p_afc.1785_1975 <- (p.df_fc_1785_1975$Point.Forecast)
thetimes$p_afc.1785_1990 <- (p.df_fc_1785_1990$Point.Forecast)
thetimes$p_afc.1785_2000 <- (p.df_fc_1785_2000$Point.Forecast)
thetimes$p_afc.1785_2020 <- (p.df_fc_1785_2020$Point.Forecast)
thetimes$p_afc.1900_2020 <- (p.df_fc_1900_2020$Point.Forecast)

# Reshape the forecast columns to long format: one row per year x series,
# with the series label in `name` and the forecast in `value`.
arimafc_long <- pivot_longer(thetimes, col = c("afc.1785_1900", "afc.1785_1950", "afc.1785_1975", "afc.1785_1990", "afc.1785_2000", "afc.1785_2020", "afc.1900_2020"))
e_arimafc_long <- pivot_longer(thetimes, col = c("e_afc.1785_1900", "e_afc.1785_1950", "e_afc.1785_1975", "e_afc.1785_1990", "e_afc.1785_2000", "e_afc.1785_2020", "e_afc.1900_2020"))
p_arimafc_long <- pivot_longer(thetimes, col = c("p_afc.1785_1900", "p_afc.1785_1950", "p_afc.1785_1975", "p_afc.1785_1990", "p_afc.1785_2000", "p_afc.1785_2020", "p_afc.1900_2020"))

# Fan chart for the 1785-1900 model: point forecast with 95% interval.
# The x positions 116:351 are forecast step indices offset by the length
# of the training sample (1785-1899) — presumably mapping steps to the
# forecast years; confirm the offset if the windows change.
ggplot(df_fc_1785_1900, aes(y = Point.Forecast, x = 116:351, ymin = Lo.95, ymax = Hi.95)) +
  geom_ribbon(alpha = .25) +
  geom_line()

# Level-shift adjustment for the reversed design: the 1900-2020 series is
# shifted down by a fixed offset before plotting.
# NOTE(review): 0.02750686 is a magic constant of unclear provenance —
# presumably the mean back-cast bias of that model; recompute if the
# models are refitted.
arimafc_long$value_i <- ifelse(arimafc_long$name == "afc.1900_2020", arimafc_long$value - 0.02750686, arimafc_long$value)

# Order the long data by series and year.  FIX: these must be created
# BEFORE the plot below — the original script referenced arimafc_longX in
# the ggplot() call before defining it, which errors in a fresh session.
arimafc_longX <- arimafc_long[order(arimafc_long$name, arimafc_long$year), ]
p_arimafc_longX <- p_arimafc_long[order(p_arimafc_long$name, p_arimafc_long$year), ]
e_arimafc_longX <- e_arimafc_long[order(e_arimafc_long$name, e_arimafc_long$year), ]

# Quick visual check: ARIMA point forecasts (red squares) vs observed
# salience (black diamonds), one panel per forecast series.
ggplot(arimafc_longX, aes(x = year)) +
  geom_point(aes(y = value), color = "red", shape = 15) +
  geom_point(aes(y = cl.share), color = "black", shape = 18) +
  facet_grid(name ~ .) +
  theme_soft()

# Per-series mean bias: mean forecast minus mean observed, by series name,
# for each of the three model families.
pmeans <- tapply(p_arimafc_longX$value, p_arimafc_longX$name, mean, na.rm = TRUE) - tapply(p_arimafc_longX$cl.share, p_arimafc_longX$name, mean, na.rm = TRUE)
emeans <- tapply(e_arimafc_longX$value, e_arimafc_longX$name, mean, na.rm = TRUE) - tapply(e_arimafc_longX$cl.share, e_arimafc_longX$name, mean, na.rm = TRUE)
xmeans <- tapply(arimafc_longX$value, arimafc_longX$name, mean, na.rm = TRUE) - tapply(arimafc_longX$cl.share, arimafc_longX$name, mean, na.rm = TRUE)

# Attach each series' own mean bias.  BUG FIX: the original indexed
# `pmeans` in all three assignments, so the placebo (e_) and full-model
# series were de-biased with the reduced-predictor means; each family
# must use its own bias vector (emeans / xmeans respectively).
p_arimafc_longX$avg <- as.numeric(pmeans[match(p_arimafc_longX$name, names(pmeans))])
e_arimafc_longX$avg <- as.numeric(emeans[match(e_arimafc_longX$name, names(emeans))])
arimafc_longX$avg <- as.numeric(xmeans[match(arimafc_longX$name, names(xmeans))])

# De-biased forecast values: subtract each series' mean offset.
p_arimafc_longX$value_i <- p_arimafc_longX$value - p_arimafc_longX$avg
e_arimafc_longX$value_i <- e_arimafc_longX$value - e_arimafc_longX$avg
arimafc_longX$value_i <- arimafc_longX$value - arimafc_longX$avg



# Stack the non-ARIMA predictions (prediction_long) with the three ARIMA
# families into one long data frame for the combined calibration figure.
# Training windows are parsed out of the series names (first/last 4-digit
# run); the rep() counts hard-code 7 series x 236 years per family —
# NOTE(review): these counts must be kept in sync with the data by hand.
prediction_longX <- data.frame(
  value = c(prediction_long$value, arimafc_longX$value_i, p_arimafc_longX$value, e_arimafc_longX$value),
  value_i = c(prediction_long$value, arimafc_longX$value_i, p_arimafc_longX$value_i, e_arimafc_longX$value_i),
  year = c(prediction_long$year, arimafc_longX$year, p_arimafc_longX$year, e_arimafc_longX$year),
  cl.share = c(prediction_long$cl.share, arimafc_longX$cl.share, p_arimafc_longX$cl.share, e_arimafc_longX$cl.share),
  train_start = c(prediction_long$train_start, as.numeric(str_extract(arimafc_longX$name, "[:digit:]{4,4}")), as.numeric(str_extract(p_arimafc_longX$name, "[:digit:]{4,4}")), as.numeric(str_extract(e_arimafc_longX$name, "[:digit:]{4,4}"))),
  train_end = c(prediction_long$train_end, as.numeric(str_extract(arimafc_longX$name, "[:digit:]{4,4}$")), as.numeric(str_extract(p_arimafc_longX$name, "[:digit:]{4,4}$")), as.numeric(str_extract(e_arimafc_longX$name, "[:digit:]{4,4}$"))),
  predict_start = c(prediction_long$predict_start, as.numeric(str_extract(arimafc_longX$name, "[:digit:]{4,4}$")), as.numeric(str_extract(p_arimafc_longX$name, "[:digit:]{4,4}$")), as.numeric(str_extract(e_arimafc_longX$name, "[:digit:]{4,4}$"))),
  predict_end = c(prediction_long$predict_end, rep(c(2020), times = c(1180 + 236)), rep(c(1900), times = c(236)), rep(c(2020), times = c(1180 + 236)), rep(c(1900), times = c(236)), rep(c(2020), times = c(1180 + 236)), rep(c(1900), times = c(236))),
  set = c(prediction_long$set, rep("ARIMA_with_ORG", times = 7 * 236), rep("ARIMA_without_ORG", times = 7 * 236), rep("ARIMA_without_predictors", times = 7 * 236)),
  range = c(prediction_long$range, rep(0.01, times = 7 * 236), rep(0.01, times = 7 * 236), rep(0.01, times = 7 * 236)),
  train = c(prediction_long$train, rep(c("1785-1900", "1785-1950", "1785-1975", "1785-1990", "1785-2000", "1785-2020", "1900-2020"), each = 236), rep(c("1785-1900", "1785-1950", "1785-1975", "1785-1990", "1785-2000", "1785-2020", "1900-2020"), each = 236), rep(c("1785-1900", "1785-1950", "1785-1975", "1785-1990", "1785-2000", "1785-2020", "1900-2020"), each = 236))
)

# Sort by panel (set x train) and year.
prediction_longX <- prediction_longX[order(prediction_longX$set, prediction_longX$train, prediction_longX$year), ]

# Keep the window markers on a single row per panel (year == 2020) so
# geom_rect() later draws each shaded rectangle only once instead of once
# per data row.
prediction_longX$predict_end <- ifelse(prediction_longX$year == 2020, prediction_longX$predict_end, NA)
prediction_longX$train_end <- ifelse(prediction_longX$year == 2020, prediction_longX$train_end, NA)
prediction_longX$range <- 0.01

# The reversed 1900-2020 with-ORG design back-predicts 1785-1900:
# overwrite its prediction window accordingly.
prediction_longX$predict_start <- ifelse(prediction_longX$set == "ARIMA_with_ORG" & prediction_longX$train == "1900-2020", 1785, prediction_longX$predict_start)
prediction_longX$predict_end <- ifelse(prediction_longX$set == "ARIMA_with_ORG" & prediction_longX$train == "1900-2020", 1900, prediction_longX$predict_end)

# Re-blank the markers after the overwrite above.
prediction_longX$predict_end <- ifelse(prediction_longX$year == 2020, prediction_longX$predict_end, NA)
prediction_longX$train_end <- ifelse(prediction_longX$year == 2020, prediction_longX$train_end, NA)

# Median of predictions per set x train panel, matched back by an
# interaction index.
agg_avg <- aggregate(prediction_longX$value, by = list(prediction_longX$set, prediction_longX$train), median, na.rm = TRUE)
agg_avg$idx <- interaction(agg_avg$Group.1, agg_avg$Group.2)
prediction_longX$idx <- interaction(prediction_longX$set, prediction_longX$train)

prediction_longX$value_avg <- agg_avg[match(prediction_longX$idx, agg_avg$idx), "x"]

# Median-center each panel, then re-center on the overall observed mean
# so panels are visually comparable on a common scale.
prediction_longX$value_x <- prediction_longX$value - prediction_longX$value_avg + mean(prediction_longX$cl.share, na.rm = TRUE)

# Ordered display labels for the training windows (A)–F) plus the
# full-sample X) panel).
prediction_longX$TRAIN <- factor(car::Recode(prediction_longX$train, "'1785-1900'='A) 1785-1900';'1785-1950'='B) 1785-1950';'1785-1975'='C) 1785-1975';'1785-1990'='D) 1785-1990';'1785-2000'='E) 1785-2000';'1900-2020'='F) 1900-2020';'1785-2020'='X) 1785-2020'"), ordered = TRUE, levels = c("A) 1785-1900", "B) 1785-1950", "C) 1785-1975", "D) 1785-1990", "E) 1785-2000", "F) 1900-2020", "X) 1785-2020"))



# Ordered display labels for the model families (SET): T1–T3 are the
# ARIMA variants, T4–T5 the non-ARIMA ones.
prediction_longX$SET <- factor(
  car::Recode(
    prediction_longX$set,
    "   'ARIMA_with_ORG'            ='T1) ARIMA_T-MG-O';
        'ARIMA_without_ORG'         ='T2) ARIMA_T-MG-.';
        'ARIMA_without_predictors'  ='T3) ARIMA_T-..-.';
        'with_ORG'                  ='T4) Non-ARIMA_.-MG-O';
        'without_ORG'               ='T5) Non-ARIMA_.-MG-.'"
  ),
  ordered = TRUE, levels = c(
    "T1) ARIMA_T-MG-O",
    "T2) ARIMA_T-MG-.",
    "T3) ARIMA_T-..-.",
    "T4) Non-ARIMA_.-MG-O",
    "T5) Non-ARIMA_.-MG-."
  )
)

# For the reversed F) 1900-2020 panels: training window is 1900-2020 and
# the (back-)prediction window is 1785-1900, for every model family.
prediction_longX[with(prediction_longX, (!is.na(train_end) & TRAIN == "F) 1900-2020")), "train_end"] <- 2020
prediction_longX[with(prediction_longX, (!is.na(train_start) & TRAIN == "F) 1900-2020")), "train_start"] <- 1900
prediction_longX[with(prediction_longX, (!is.na(predict_end) & TRAIN == "F) 1900-2020")), "predict_end"] <- 1900
prediction_longX[with(prediction_longX, (!is.na(predict_start) & TRAIN == "F) 1900-2020")), "predict_start"] <- 1785

# Combined calibration figure: observed salience (open squares) vs the
# median-centered predictions value_x (red dots), TRAIN x SET facet grid
# on a fixed scale, with the training/prediction window rectangles.
gg_calibrationX <- ggplot(prediction_longX, aes(x = year)) +
  geom_point(size = 1, shape = 0, aes(y = cl.share)) +
  geom_point(size = 1, color = "red", shape = 16, aes(y = value_x)) +
  facet_grid(TRAIN ~ SET, scale = "fixed") +
  theme_soft() +
  ylab("Crisis Labelling Salience") +
  scale_y_continuous() +
  geom_rect(aes(xmin = train_start, xmax = train_end, ymax = range * 10), ymin = 0.01, fill = "darkblue", alpha = .25) +
  geom_rect(aes(xmin = predict_start, xmax = predict_end, ymax = range * 10), ymin = 0.01, fill = "darkred", alpha = .25) +
  ylim(-0.025, 0.1)

# PNG export of the combined figure.
ggsave(file = "ignore_time.png", device = "png", gg_calibrationX, unit = "cm", width = 20, height = 14, scale = 2.00, dpi = 1200)


# Interactive check of the ARIMA_with_ORG family using the raw `value`
# (not the centered value_x) on free y scales.
ggplot(subset(prediction_longX, set == "ARIMA_with_ORG"), aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = value)) +
  geom_rect(aes(xmin = train_start, xmax = train_end, ymax = range), ymin = 0, fill = "darkblue", alpha = .25) +
  geom_rect(aes(xmin = predict_start, xmax = predict_end, ymax = range), ymin = 0, fill = "darkred", alpha = .25) +
  facet_grid(train ~ set, scale = "free_y") +
  theme_soft() +
  ylab("Crisis Labelling Salience")

# SVG export of the same combined figure saved as PNG above.
ggsave(file = "gg_calibrationX.svg", device = "svg", gg_calibrationX, unit = "cm", width = 12, height = 16, scale = 2.00, dpi = 1200)


# Out-of-sample RMSE of the centered predictions (value_x) against the
# observed series, per model family and training window.  The year range
# in each subset is the respective held-out period (for the reversed
# 1900-2020 window, the back-cast years 1785-1900; for 1785-2020, the
# full in-sample range).

# Reduced-predictor ARIMA family.
sqrt(with(subset(prediction_longX, set == "ARIMA_without_ORG" & train == "1785-1900" & year %in% 1900:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "ARIMA_without_ORG" & train == "1785-1950" & year %in% 1950:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "ARIMA_without_ORG" & train == "1785-1975" & year %in% 1975:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "ARIMA_without_ORG" & train == "1785-1990" & year %in% 1990:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "ARIMA_without_ORG" & train == "1785-2000" & year %in% 2000:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "ARIMA_without_ORG" & train == "1785-2020" & year %in% 1785:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "ARIMA_without_ORG" & train == "1900-2020" & year %in% 1785:1900), mean(abs(cl.share - value_x)^2)))

# Placebo ARIMA family (white-noise regressors only).
sqrt(with(subset(prediction_longX, set == "ARIMA_without_predictors" & train == "1785-1900" & year %in% 1900:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "ARIMA_without_predictors" & train == "1785-1950" & year %in% 1950:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "ARIMA_without_predictors" & train == "1785-1975" & year %in% 1975:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "ARIMA_without_predictors" & train == "1785-1990" & year %in% 1990:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "ARIMA_without_predictors" & train == "1785-2000" & year %in% 2000:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "ARIMA_without_predictors" & train == "1785-2020" & year %in% 1785:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "ARIMA_without_predictors" & train == "1900-2020" & year %in% 1785:1900), mean(abs(cl.share - value_x)^2)))

# Full-predictor ARIMA family.
sqrt(with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1785-1900" & year %in% 1900:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1785-1950" & year %in% 1950:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1785-1975" & year %in% 1975:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1785-1990" & year %in% 1990:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1785-2000" & year %in% 2000:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1785-2020" & year %in% 1785:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1900-2020" & year %in% 1785:1900), mean(abs(cl.share - value_x)^2)))

# Non-ARIMA family with ORG predictors.
sqrt(with(subset(prediction_longX, set == "with_ORG" & train == "1785-1900" & year %in% 1900:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "with_ORG" & train == "1785-1950" & year %in% 1950:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "with_ORG" & train == "1785-1975" & year %in% 1975:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "with_ORG" & train == "1785-1990" & year %in% 1990:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "with_ORG" & train == "1785-2000" & year %in% 2000:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "with_ORG" & train == "1785-2020" & year %in% 1785:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "with_ORG" & train == "1900-2020" & year %in% 1785:1900), mean(abs(cl.share - value_x)^2)))

# Non-ARIMA family without ORG predictors.
sqrt(with(subset(prediction_longX, set == "without_ORG" & train == "1785-1900" & year %in% 1900:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "without_ORG" & train == "1785-1950" & year %in% 1950:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "without_ORG" & train == "1785-1975" & year %in% 1975:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "without_ORG" & train == "1785-1990" & year %in% 1990:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "without_ORG" & train == "1785-2000" & year %in% 2000:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "without_ORG" & train == "1785-2020" & year %in% 1785:2020), mean(abs(cl.share - value_x)^2)))
sqrt(with(subset(prediction_longX, set == "without_ORG" & train == "1900-2020" & year %in% 1785:1900), mean(abs(cl.share - value_x)^2)))

# Glob-pattern dictionaries for each crisis-vocabulary keyword family
# (quanteda::dictionary; patterns are matched against tokens).
dict_crisis <- dictionary(list(crisis = c("crisis.*", "crises.*")))
dict_disaster <- dictionary(list(disaster = c("disaster.*")))
dict_collapse <- dictionary(list(collapse = c("collaps.*")))
dict_recession <- dictionary(list(recession = c("recession.*")))
dict_emergency <- dictionary(list(emergency = c("emergency.*", "emergencies.*")))
dict_catastrophe <- dictionary(list(catastrophe = c("catastroph.*")))
# FIX: removed two stray empty-string patterns ("", "") that were left in
# the epidemic entry; they contribute no matches and can confuse pattern
# compilation.
dict_epidemic <- dictionary(list(epidemic = c("epidemic.*", "pandemic.*")))
dict_breakdown <- dictionary(list(breakdown = c("breakdown.*")))
dict_debacle <- dictionary(list(debacle = c("debacl.*")))

# Replace each token by its dictionary key (or "NO" for non-matches), one
# tokens object per keyword family.
kw_crisis <- tokens_lookup(x = cc.tokens, dictionary = dict_crisis, nomatch = "NO")
kw_disaster <- tokens_lookup(x = cc.tokens, dictionary = dict_disaster, nomatch = "NO")
kw_collapse <- tokens_lookup(x = cc.tokens, dictionary = dict_collapse, nomatch = "NO")
kw_recession <- tokens_lookup(x = cc.tokens, dictionary = dict_recession, nomatch = "NO")
kw_emergency <- tokens_lookup(x = cc.tokens, dictionary = dict_emergency, nomatch = "NO")
kw_catastrophe <- tokens_lookup(x = cc.tokens, dictionary = dict_catastrophe, nomatch = "NO")
kw_epidemic <- tokens_lookup(x = cc.tokens, dictionary = dict_epidemic, nomatch = "NO")
kw_breakdown <- tokens_lookup(x = cc.tokens, dictionary = dict_breakdown, nomatch = "NO")
kw_debacle <- tokens_lookup(x = cc.tokens, dictionary = dict_debacle, nomatch = "NO")

# Count total keyword occurrences per document.
#
# x:  a list-like collection of character vectors (here: tokens objects
#     after tokens_lookup(), where matches carry the dictionary key).
# kw: the keyword pattern counted via stringr::str_count().
# Returns an unnamed numeric vector, one total per document (same as the
# original lapply + as.numeric round-trip, but type-safe via vapply).
kwsum <- function(x, kw) {
  unname(vapply(x, function(doc) sum(str_count(doc, kw)), numeric(1)))
}

# Per-document occurrence counts for each keyword family.
kw_crisis_sum <- kwsum(x = kw_crisis, kw = "crisis")
kw_disaster_sum <- kwsum(x = kw_disaster, kw = "disaster")
kw_collapse_sum <- kwsum(x = kw_collapse, kw = "collapse")
kw_recession_sum <- kwsum(x = kw_recession, kw = "recession")
kw_emergency_sum <- kwsum(x = kw_emergency, kw = "emergency")
kw_catastrophe_sum <- kwsum(x = kw_catastrophe, kw = "catastrophe")
kw_epidemic_sum <- kwsum(x = kw_epidemic, kw = "epidemic")
kw_breakdown_sum <- kwsum(x = kw_breakdown, kw = "breakdown")
kw_debacle_sum <- kwsum(x = kw_debacle, kw = "debacle")

# Share of documents containing at least one hit per keyword family
# (TRUE proportion = share of documents with the keyword).
prop.table(table(kw_crisis_sum > 0))
prop.table(table(kw_disaster_sum > 0))
prop.table(table(kw_collapse_sum > 0))
prop.table(table(kw_recession_sum > 0))
prop.table(table(kw_emergency_sum > 0))
prop.table(table(kw_catastrophe_sum > 0))
prop.table(table(kw_epidemic_sum > 0))
prop.table(table(kw_breakdown_sum > 0))
prop.table(table(kw_debacle_sum > 0))


# Out-of-sample R^2 (squared Pearson correlation between predictions and
# observed salience) per model family and training window, evaluated on
# each window's held-out years.

# Full-predictor ARIMA family.
with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1785-1900" & year %in% 1900:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1785-1950" & year %in% 1950:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1785-1975" & year %in% 1975:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1785-1990" & year %in% 1990:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1785-2000" & year %in% 2000:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1785-2020" & year %in% 1785:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1900-2020" & year %in% 1785:1900), cor.test(cl.share, value))$estimate^2

# Non-ARIMA family with ORG predictors.
with(subset(prediction_longX, set == "with_ORG" & train == "1785-1900" & year %in% 1900:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "with_ORG" & train == "1785-1950" & year %in% 1950:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "with_ORG" & train == "1785-1975" & year %in% 1975:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "with_ORG" & train == "1785-1990" & year %in% 1990:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "with_ORG" & train == "1785-2000" & year %in% 2000:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "with_ORG" & train == "1785-2020" & year %in% 1785:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "with_ORG" & train == "1900-2020" & year %in% 1785:1900), cor.test(cl.share, value))$estimate^2

# Non-ARIMA family without ORG predictors.
with(subset(prediction_longX, set == "without_ORG" & train == "1785-1900" & year %in% 1900:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "without_ORG" & train == "1785-1950" & year %in% 1950:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "without_ORG" & train == "1785-1975" & year %in% 1975:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "without_ORG" & train == "1785-1990" & year %in% 1990:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "without_ORG" & train == "1785-2000" & year %in% 2000:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "without_ORG" & train == "1785-2020" & year %in% 1785:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "without_ORG" & train == "1900-2020" & year %in% 1785:1900), cor.test(cl.share, value))$estimate^2


# Text tables for both model families without the window labels.
stargazer(lm_CL1, lm_CL2, lm_CL7, lm_CL6, lm_CL5, lm_CL4, lm_CL3, type = "text")

stargazer(lm_CL1X, lm_CL2X, lm_CL7X, lm_CL6X, lm_CL5X, lm_CL4X, lm_CL3X, type = "text")

# Alternative specification ladder: media block (with interaction),
# governance block (incl. ministries), organisation block, and their
# combinations, using ccORG1000 instead of the per-article measure.
lm_CLX1 <- lm(cl.share ~ Penetration * media_autonomy_i, data = thetimes)
lm_CLX2 <- lm(cl.share ~ statistics + spending_GDP + gini_rev_i + ministries, data = thetimes)
lm_CLX3 <- lm(cl.share ~ ccORG1000 + ccPERSON_per_article, data = thetimes)
lm_CLX4 <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i + ministries, data = thetimes)
lm_CLX5 <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i + ministries + ccORG1000 + ccPERSON_per_article, data = thetimes)

stargazer(lm_CLX1, lm_CLX2, lm_CLX3, lm_CLX4, lm_CLX5, type = "text")

# Predicting the crisis-sponsor mention rate (ccORG1000 = ccORG_per_article
# * 1000) from state capacity (spending_GDP, gini_rev_i) and the media
# landscape (Penetration, media_autonomy_i).  Nested ladder from the
# intercept-only model up to the full model; the *Y variants add a linear
# time trend (dyear = years since 1785).
ORGlm_3Y <- (lm(ccORG1000 ~ spending_GDP + gini_rev_i + Penetration + media_autonomy_i + 1 + dyear, data = thetimes))

ORGlm_3 <- (lm(ccORG1000 ~ spending_GDP + gini_rev_i + Penetration + media_autonomy_i + 1, data = thetimes))

ORGlm_2Y <- (lm(ccORG1000 ~ spending_GDP + gini_rev_i + 1 + dyear, data = thetimes))

ORGlm_2 <- (lm(ccORG1000 ~ spending_GDP + gini_rev_i + 1, data = thetimes))

ORGlm_1Y <- (lm(ccORG1000 ~ Penetration + media_autonomy_i + 1 + dyear, data = thetimes))

ORGlm_1 <- (lm(ccORG1000 ~ Penetration + media_autonomy_i + 1, data = thetimes))

ORGlm_0Y <- (lm(ccORG1000 ~ 1 + dyear, data = thetimes))

ORGlm_0 <- (lm(ccORG1000 ~ 1, data = thetimes))

# HTML tables: the ladder without and with the time trend.
stargazer(ORGlm_0, ORGlm_1, ORGlm_2, ORGlm_3, type = "html", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)
stargazer(ORGlm_0Y, ORGlm_1Y, ORGlm_2Y, ORGlm_3Y, type = "html", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)

# Does adding the other predictor block improve on each partial model?
anova(ORGlm_1, ORGlm_3)
anova(ORGlm_2, ORGlm_3)

# Pre-whitened versions of the crisis-label share: residuals from
# forecast::Arima fits with AR(1), first-difference, and MA(2)
# specifications.  These residual series serve below as alternative
# dependent variables with simple serial dependence stripped out.
thetimes$CLS_100 <- Arima(thetimes$cl.share, order = c(1, 0, 0))$resid

thetimes$CLS_010 <- Arima(thetimes$cl.share, order = c(0, 1, 0))$resid

thetimes$CLS_002 <- Arima(thetimes$cl.share, order = c(0, 0, 2))$resid



# Regression ladders on the pre-whitened series.  First the MA(2)
# residuals (CClm_x*):
CClm_x0 <- (lm(CLS_002 ~ 1, data = thetimes))

CClm_x1 <- (lm(CLS_002 ~ Penetration + media_autonomy_i + 1, data = thetimes))

CClm_x1b <- (lm(CLS_002 ~ spending_GDP + gini_rev_i + 1, data = thetimes))

CClm_x1c <- (lm(CLS_002 ~ ccORG_per_article + 1, data = thetimes))

CClm_x2 <- (lm(CLS_002 ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + 1, data = thetimes))

CClm_x3 <- (lm(CLS_002 ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG_per_article + 1, data = thetimes))


# Same ladder on the first-difference residuals.  NOTE(review): the names
# CClm_0 ... CClm_3 are reused by the AR(1) ladder immediately below, so
# only the AR(1) fits survive for later use.
CClm_0 <- (lm(CLS_010 ~ 1, data = thetimes))

CClm_1 <- (lm(CLS_010 ~ Penetration + media_autonomy_i + 1, data = thetimes))

CClm_1b <- (lm(CLS_010 ~ spending_GDP + gini_rev_i + 1, data = thetimes))

CClm_1c <- (lm(CLS_010 ~ ccORG_per_article + 1, data = thetimes))

CClm_2 <- (lm(CLS_010 ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + 1, data = thetimes))

CClm_3 <- (lm(CLS_010 ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG_per_article + 1, data = thetimes))


# AR(1)-residual ladder; overwrites the CLS_010 fits above.
CClm_0 <- (lm(CLS_100 ~ 1, data = thetimes))

CClm_1 <- (lm(CLS_100 ~ Penetration + media_autonomy_i + 1, data = thetimes))

CClm_1b <- (lm(CLS_100 ~ spending_GDP + gini_rev_i + 1, data = thetimes))

CClm_1c <- (lm(CLS_100 ~ ccORG_per_article + 1, data = thetimes))

CClm_2 <- (lm(CLS_100 ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + 1, data = thetimes))

CClm_3 <- (lm(CLS_100 ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG_per_article + 1, data = thetimes))


# Derived year-level variables: percent versions of the two salience
# shares and a time index counted in years since the first volume (1785).
thetimes[["CL_pcent"]] <- 100 * thetimes[["cl.share"]]
thetimes[["CNW_pcent"]] <- 100 * thetimes[["cnw.share"]]

thetimes[["dyear"]] <- thetimes[["year"]] - 1785

# Crisis-label salience (CL_pcent, percent of articles) regressed on the
# predictor blocks; nested ladder plus *Y versions with a linear time
# trend.  NOTE(review): these reuse the CClm_* names of the residual-series
# models above, overwriting them.
CClm_0 <- (lm(CL_pcent ~ 1, data = thetimes))

CClm_1 <- (lm(CL_pcent ~ Penetration + media_autonomy_i + 1, data = thetimes))

CClm_1b <- (lm(CL_pcent ~ spending_GDP + gini_rev_i + 1, data = thetimes))

CClm_1c <- (lm(CL_pcent ~ ccORG1000 + 1, data = thetimes))

CClm_2 <- (lm(CL_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + 1, data = thetimes))

CClm_3 <- (lm(CL_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000 + 1, data = thetimes))

# Time-trended versions.
CClm_0Y <- (lm(CL_pcent ~ dyear + 1, data = thetimes))

CClm_1aY <- (lm(CL_pcent ~ Penetration + media_autonomy_i + dyear, data = thetimes))

CClm_1bY <- (lm(CL_pcent ~ spending_GDP + gini_rev_i + dyear, data = thetimes))

CClm_1cY <- (lm(CL_pcent ~ ccORG1000 + dyear, data = thetimes))

CClm_2Y <- (lm(CL_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + 1 + dyear, data = thetimes))

CClm_3Y <- (lm(CL_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000 + 1 + dyear, data = thetimes))

# Does the media-landscape block improve on the null model?
anova(CClm_0, CClm_1)

# HTML tables for the two ladders.
stargazer(CClm_0, CClm_1, CClm_1b, CClm_1c, CClm_2, CClm_3, type = "html", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)
stargazer(CClm_0Y, CClm_1aY, CClm_1bY, CClm_1cY, CClm_2Y, CClm_3Y, type = "html", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)

# Sequential comparison of the nested ladder.
anova(CClm_0, CClm_1, CClm_2, CClm_3)

# Repeat of the sponsor-rate table for side-by-side inspection.
stargazer(ORGlm_0, ORGlm_1, ORGlm_2, ORGlm_3, type = "html", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)

stargazer(CClm_0, CClm_1, CClm_1b, CClm_1c, CClm_2, CClm_3, CClm_Y, type = "text")

# Causal mediation (mediation::mediate): does the sponsor-mention rate
# (ccORG1000) mediate the effect of each structural predictor on crisis-
# label salience?  model.m = mediator model (ORGlm_3), model.y = outcome
# model with time trend (CClm_3Y).
med_penetration <- mediate(model.m = ORGlm_3, model.y = CClm_3Y, mediator = "ccORG1000", treat = "Penetration")
med_autonomy <- mediate(model.m = ORGlm_3, model.y = CClm_3Y, mediator = "ccORG1000", treat = "media_autonomy_i")
med_spending <- mediate(model.m = ORGlm_3, model.y = CClm_3Y, mediator = "ccORG1000", treat = "spending_GDP")
med_diversity <- mediate(model.m = ORGlm_3, model.y = CClm_3Y, mediator = "ccORG1000", treat = "gini_rev_i")

# Sensitivity of the mediation estimates to unobserved confounding.
sensitivity_penetration <- medsens(med_penetration, effect.type = "both")
sensitivity_autonomy <- medsens(med_autonomy, effect.type = "both")
sensitivity_spending <- medsens(med_spending, effect.type = "both")
sensitivity_diversity <- medsens(med_diversity, effect.type = "both")

# Sponsor-mention rate rescaled to mentions per 1,000 articles, and a
# log(x + 1) transform of the yearly crisis-news-wave count.
thetimes[["ccORG1000"]] <- 1000 * thetimes[["ccORG_per_article"]]
thetimes[["logCNW"]] <- log(thetimes[["cnw"]] + 1)

# lavaan path models.  In each: the three coverage outcomes (CL_pcent,
# CNW_pcent, logCNW) are regressed on the sponsor-mention rate via the
# labelled paths d/c/b; ccORG1000 itself is regressed on the structural
# predictors (labels a1-a4).  The := lines define indirect effects
# (a-path x outcome path) for every predictor/outcome combination.
# Full model: structural predictors also have direct paths to outcomes.
lmodel <- "
            CL_pcent ~          1 + d*ccORG1000 + spending_GDP    + gini_rev_i    + Penetration    + media_autonomy_i
            CNW_pcent ~         1 + c*ccORG1000 + spending_GDP    + gini_rev_i    + Penetration    + media_autonomy_i
            logCNW ~                1 + b*ccORG1000 + spending_GDP    + gini_rev_i    + Penetration    + media_autonomy_i
            ccORG1000 ~ 1 +                       a1*spending_GDP + a2*gini_rev_i + a3*Penetration + a4*media_autonomy_i
            intensity_cls:=a1*d
            intensity_cnws:=a1*c
            intensity_cnw:=a1*b
            diversity_cls:=a2*d
            diversity_cnws:=a2*c
            diversity_cnw:=a2*b
            penetration_cls:=a3*d
            penetration_cnws:=a3*c
            penetration_cnw:=a3*b
            autonomy_cls:=a4*d
            autonomy_cnws:=a4*c
            autonomy_cnw:=a4*b
            "

# Full-mediation variant: no direct structural paths to the outcomes.
lmodel2 <- "
            CL_pcent ~          1 + d*ccORG1000
            CNW_pcent ~         1 + c*ccORG1000
            logCNW ~                1 + b*ccORG1000
            ccORG1000 ~ 1 +                       a1*spending_GDP + a2*gini_rev_i + a3*Penetration + a4*media_autonomy_i
            intensity_cls:=a1*d
            intensity_cnws:=a1*c
            intensity_cnw:=a1*b
            diversity_cls:=a2*d
            diversity_cnws:=a2*c
            diversity_cnw:=a2*b
            penetration_cls:=a3*d
            penetration_cnws:=a3*c
            penetration_cnw:=a3*b
            autonomy_cls:=a4*d
            autonomy_cnws:=a4*c
            autonomy_cnw:=a4*b
            "

# No-mediator baseline: structural predictors only.
lmodel3 <- "
            CL_pcent ~          spending_GDP    + gini_rev_i    + Penetration    + media_autonomy_i
            CNW_pcent ~         spending_GDP    + gini_rev_i    + Penetration    + media_autonomy_i
            logCNW ~            spending_GDP    + gini_rev_i    + Penetration    + media_autonomy_i
"


# Fit the three path models with lavaan's sem().
lfit <- sem(model = lmodel, data = thetimes)

lfit2 <- sem(model = lmodel2, data = thetimes)
lfit3 <- sem(model = lmodel3, data = thetimes)


# Crisis-news-wave salience (CNW_pcent) ladders.  CNWlm_0x additionally
# controls for the yearly article count; CNWlm_0Y0 is a no-intercept
# trend-only variant.
CNWlm_0x <- (lm(CNW_pcent ~ 1 + ccORG1000 + articles, data = thetimes))


CNWlm_0 <- (lm(CNW_pcent ~ 1, data = thetimes))

CNWlm_0Y0 <- (lm(CNW_pcent ~ 0 + dyear, data = thetimes))


CNWlm_1a <- (lm(CNW_pcent ~ Penetration + media_autonomy_i, data = thetimes))

CNWlm_1b <- (lm(CNW_pcent ~ spending_GDP + gini_rev_i, data = thetimes))

CNWlm_2 <- (lm(CNW_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes))

CNWlm_3 <- (lm(CNW_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes))


# Time-trended versions.
CNWlm_0Y <- (lm(CNW_pcent ~ dyear, data = thetimes))

CNWlm_1aY <- (lm(CNW_pcent ~ Penetration + media_autonomy_i + dyear, data = thetimes))

CNWlm_1bY <- (lm(CNW_pcent ~ spending_GDP + gini_rev_i + dyear, data = thetimes))

CNWlm_2Y <- (lm(CNW_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + dyear, data = thetimes))

CNWlm_3Y <- (lm(CNW_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000 + dyear, data = thetimes))

# HTML tables for both ladders.
stargazer(CNWlm_0, CNWlm_1a, CNWlm_1b, CNWlm_2, CNWlm_3, type = "html", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)
stargazer(CNWlm_0Y, CNWlm_1aY, CNWlm_1bY, CNWlm_2Y, CNWlm_3Y, type = "html", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)

# Model comparison for the CNW-salience ladder.  The original referenced
# the undefined object `CNWlm_1`; the media-landscape baseline defined
# above is named `CNWlm_1a`, used here instead.
anova(CNWlm_1a, CNWlm_2, CNWlm_3)

stargazer(CNWlm_1a, CNWlm_2, CNWlm_3, type = "text")

# Mediation via the sponsor-mention rate for the CNW-salience outcome
# (time-trended model CNWlm_3Y), plus confounding sensitivity analyses.
med_penetration2 <- mediate(model.m = ORGlm_3, model.y = CNWlm_3Y, mediator = "ccORG1000", treat = "Penetration")
med_autonomy2 <- mediate(model.m = ORGlm_3, model.y = CNWlm_3Y, mediator = "ccORG1000", treat = "media_autonomy_i")
med_spending2 <- mediate(model.m = ORGlm_3, model.y = CNWlm_3Y, mediator = "ccORG1000", treat = "spending_GDP")
med_diversity2 <- mediate(model.m = ORGlm_3, model.y = CNWlm_3Y, mediator = "ccORG1000", treat = "gini_rev_i")

sensitivity_penetration2 <- medsens(med_penetration2, effect.type = "both")
sensitivity_autonomy2 <- medsens(med_autonomy2, effect.type = "both")
sensitivity_spending2 <- medsens(med_spending2, effect.type = "both")
sensitivity_diversity2 <- medsens(med_diversity2, effect.type = "both")



# Log-transformed yearly wave count (duplicate of the assignment made
# earlier in this script; harmless re-computation).
thetimes$logCNW <- log(thetimes$cnw + 1)

# Nested lm ladder for the log wave count, without and with (Y-suffix) a
# linear time trend.
COUNTlm_0 <- (lm(logCNW ~ 1, data = thetimes))

COUNTlm_1a <- (lm(logCNW ~ Penetration + media_autonomy_i, data = thetimes))

COUNTlm_1b <- (lm(logCNW ~ spending_GDP + gini_rev_i, data = thetimes))

COUNTlm_1c <- (lm(logCNW ~ ccORG1000, data = thetimes))

COUNTlm_2 <- (lm(logCNW ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes))

COUNTlm_3 <- (lm(logCNW ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes))

COUNTlm_0Y <- (lm(logCNW ~ 1 + dyear, data = thetimes))

COUNTlm_1aY <- (lm(logCNW ~ Penetration + media_autonomy_i + dyear, data = thetimes))

COUNTlm_1bY <- (lm(logCNW ~ spending_GDP + gini_rev_i + dyear, data = thetimes))

COUNTlm_1cY <- (lm(logCNW ~ ccORG1000 + dyear, data = thetimes))

COUNTlm_2Y <- (lm(logCNW ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + dyear, data = thetimes))

COUNTlm_3Y <- (lm(logCNW ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000 + dyear, data = thetimes))

# HTML tables for both ladders.
stargazer(COUNTlm_0, COUNTlm_1a, COUNTlm_1b, COUNTlm_1c, COUNTlm_2, COUNTlm_3, type = "html", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)

stargazer(COUNTlm_0Y, COUNTlm_1aY, COUNTlm_1bY, COUNTlm_1cY, COUNTlm_2Y, COUNTlm_3Y, type = "html", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)




# Negative-binomial models for the raw yearly wave count (cnw).
# NOTE(review): these overwrite the log-lm objects of the same names
# fitted just above.
COUNTlm_0 <- (MASS::glm.nb(cnw ~ 1, data = thetimes))

COUNTlm_1 <- (glm.nb(cnw ~ Penetration + media_autonomy_i, data = thetimes))

COUNTlm_1b <- (glm.nb(cnw ~ spending_GDP + gini_rev_i, data = thetimes))

COUNTlm_1c <- (glm.nb(cnw ~ ccORG_per_article, data = thetimes))

COUNTlm_2 <- (glm.nb(cnw ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes))

# Full model on the shifted response cnw + 1.
COUNTlm_3 <- (glm.nb(I(cnw + 1) ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG_per_article, data = thetimes))

# Gaussian glm counterpart of the full model (default family; used as
# model.y in the *3X mediation analyses below).
COUNTlm_3X <- (glm(cnw ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG_per_article, data = thetimes))


# Poisson ladder for the same count outcome.
COUNTglm_0 <- (glm(cnw ~ 1, data = thetimes, family = "poisson"))
COUNTglm_1 <- (glm(cnw ~ 1 + Penetration + media_autonomy_i, data = thetimes, family = "poisson"))
COUNTglm_1b <- (glm(cnw ~ 1 + spending_GDP + gini_rev_i, data = thetimes, family = "poisson"))
COUNTglm_1c <- (glm(cnw ~ 1 + ccORG_per_article, data = thetimes, family = "poisson"))
COUNTglm_2 <- (glm(cnw ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes, family = "poisson"))
COUNTglm_3 <- (glm(cnw ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG_per_article, data = thetimes, family = "poisson"))

# NOTE(review): the full Poisson model is immediately replaced by this
# log-linear lm version.
COUNTglm_3 <- (lm(log(cnw + 1) ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG_per_article, data = thetimes))



# Zero-inflated Poisson ladder (pscl::zeroinfl) for the wave count, each
# specification including the linear time trend.
COUNTziglm_0 <- (zeroinfl(cnw ~ 1 + dyear, data = thetimes, dist = "poisson"))
COUNTziglm_1 <- (zeroinfl(cnw ~ 1 + Penetration + media_autonomy_i + dyear, data = thetimes, dist = "poisson"))
COUNTziglm_1b <- (zeroinfl(cnw ~ 1 + spending_GDP + gini_rev_i + dyear, data = thetimes, dist = "poisson"))
COUNTziglm_1c <- (zeroinfl(cnw ~ 1 + ccORG_per_article + dyear, data = thetimes, dist = "poisson"))
COUNTziglm_2 <- (zeroinfl(cnw ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + dyear, data = thetimes, dist = "poisson"))
COUNTziglm_3 <- (zeroinfl(cnw ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG_per_article + dyear, data = thetimes, dist = "poisson"))


# Log-linear models with the raw calendar year as trend; these overwrite
# the negative-binomial COUNTlm_* objects above and are the versions used
# in the anova/stargazer/correlation checks below.
COUNTlm_0 <- (lm(log(cnw + 1) ~ 1 + year, data = thetimes))
COUNTlm_1 <- (lm(log(cnw + 1) ~ 1 + Penetration + media_autonomy_i + year, data = thetimes))
COUNTlm_1b <- (lm(log(cnw + 1) ~ 1 + spending_GDP + gini_rev_i + year, data = thetimes))
COUNTlm_1c <- (lm(log(cnw + 1) ~ 1 + ccORG_per_article + year, data = thetimes))
COUNTlm_2 <- (lm(log(cnw + 1) ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + year, data = thetimes))
COUNTlm_3 <- (lm(log(cnw + 1) ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG_per_article + year, data = thetimes))



# Incremental-fit tests: each predictor block against the null model, and
# the mediator's contribution on top of the structural blocks.
anova(COUNTlm_0, COUNTlm_1)
anova(COUNTlm_0, COUNTlm_1b)
anova(COUNTlm_0, COUNTlm_1c)

anova(COUNTlm_2, COUNTlm_3)

stargazer(COUNTlm_1, COUNTlm_1b, COUNTlm_1c, COUNTlm_2, COUNTlm_3, type = "text")

# Pseudo-R^2 for the log-count models: squared correlation between the
# observed series and the model predictions.  The COUNTlm_* objects here
# are plain `lm` fits, so the generic predict() is used instead of calling
# the method predict.glm() directly on non-glm objects (wrong-method call
# in the original; results are identical for the default type).
# NOTE(review): `thetimes$CNW` is not created anywhere in this section --
# the visible count columns are `cnw` and `logCNW`; confirm which column
# these correlations are meant to use.
cor.test(thetimes$CNW, predict(COUNTlm_0))$estimate^2
cor.test(thetimes$CNW, predict(COUNTlm_1))$estimate^2
cor.test(thetimes$CNW, predict(COUNTlm_1b))$estimate^2
cor.test(thetimes$CNW, predict(COUNTlm_1c))$estimate^2
cor.test(thetimes$CNW, predict(COUNTlm_2))$estimate^2
cor.test(thetimes$CNW, predict(COUNTlm_3))$estimate^2

# Mediation analyses for the log wave count (outcome model COUNTlm_3Y).
med_penetration3 <- mediate(model.m = ORGlm_3, model.y = COUNTlm_3Y, mediator = "ccORG1000", treat = "Penetration")
med_autonomy3 <- mediate(model.m = ORGlm_3, model.y = COUNTlm_3Y, mediator = "ccORG1000", treat = "media_autonomy_i")
med_spending3 <- mediate(model.m = ORGlm_3, model.y = COUNTlm_3Y, mediator = "ccORG1000", treat = "spending_GDP")
med_diversity3 <- mediate(model.m = ORGlm_3, model.y = COUNTlm_3Y, mediator = "ccORG1000", treat = "gini_rev_i")

# Variants with the gaussian glm outcome COUNTlm_3X and the partial
# mediator models.  NOTE(review): COUNTlm_3X contains ccORG_per_article,
# not ccORG1000, so the mediator named here does not appear in model.y --
# mediate() requires the mediator variable in the outcome model; confirm.
med_penetration3X <- mediate(model.m = ORGlm_1, model.y = COUNTlm_3X, mediator = "ccORG1000", treat = "Penetration")
med_autonomy3X <- mediate(model.m = ORGlm_1, model.y = COUNTlm_3X, mediator = "ccORG1000", treat = "media_autonomy_i")
med_spending3X <- mediate(model.m = ORGlm_2, model.y = COUNTlm_3X, mediator = "ccORG1000", treat = "spending_GDP")
med_diversity3X <- mediate(model.m = ORGlm_2, model.y = COUNTlm_3X, mediator = "ccORG1000", treat = "gini_rev_i")


# Confounding sensitivity for the COUNTlm_3Y-based analyses.
sensitivity_penetration3 <- medsens(med_penetration3, effect.type = "both")
sensitivity_autonomy3 <- medsens(med_autonomy3, effect.type = "both")
sensitivity_spending3 <- medsens(med_spending3, effect.type = "both")
sensitivity_diversity3 <- medsens(med_diversity3, effect.type = "both")


# Alternative standardizations of the two salience measures, kept on a
# x100 scale for comparability of regression coefficients.
# scale() returns a one-column matrix; as.numeric() drops the dim
# attribute so the data-frame columns stay plain numeric vectors
# (matrix columns can produce surprising names/behavior downstream).
# Numerical values are unchanged.
thetimes$CL_z <- 100 * as.numeric(scale(thetimes$CL_pcent, scale = TRUE))
thetimes$CNW_z <- 100 * as.numeric(scale(thetimes$CNW_pcent, scale = TRUE))

# Max-normalized (0-1) versions ...
thetimes$CL_mmx <- (thetimes$CL_pcent / max(thetimes$CL_pcent, na.rm = TRUE))
thetimes$CNW_mmx <- (thetimes$CNW_pcent / max(thetimes$CNW_pcent, na.rm = TRUE))

# ... then mean-centered and expressed in percentage points.
thetimes$CL_mm <- 100 * (thetimes$CL_mmx - mean(thetimes$CL_mmx, na.rm = TRUE))
thetimes$CNW_mm <- 100 * (thetimes$CNW_mmx - mean(thetimes$CNW_mmx, na.rm = TRUE))

# Re-estimation of the CNW and CL ladders on the z-standardized (_z) and
# min-max (_mm) outcome versions; used by df_comp below to compare effect
# sizes across standardization schemes.
CNWlm_0z <- (lm(CNW_z ~ 1, data = thetimes))

CNWlm_tz <- (lm(CNW_z ~ dyear, data = thetimes))

CNWlm_1z <- (lm(CNW_z ~ Penetration + media_autonomy_i, data = thetimes))

CNWlm_2z <- (lm(CNW_z ~ spending_GDP + gini_rev_i, data = thetimes))

CNWlm_3z <- (lm(CNW_z ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes))

CNWlm_4z <- (lm(CNW_z ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes))

CNWlm_0mm <- (lm(CNW_mm ~ 1, data = thetimes))

CNWlm_tmm <- (lm(CNW_mm ~ dyear, data = thetimes))

CNWlm_1mm <- (lm(CNW_mm ~ Penetration + media_autonomy_i, data = thetimes))

CNWlm_2mm <- (lm(CNW_mm ~ spending_GDP + gini_rev_i, data = thetimes))

CNWlm_3mm <- (lm(CNW_mm ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes))

CNWlm_4mm <- (lm(CNW_mm ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes))

# Same two ladders for crisis-label salience.
CLlm_0z <- (lm(CL_z ~ 1, data = thetimes))

CLlm_tz <- (lm(CL_z ~ dyear, data = thetimes))

CLlm_1z <- (lm(CL_z ~ Penetration + media_autonomy_i, data = thetimes))

CLlm_2z <- (lm(CL_z ~ spending_GDP + gini_rev_i, data = thetimes))

CLlm_3z <- (lm(CL_z ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes))

CLlm_4z <- (lm(CL_z ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes))

CLlm_0mm <- (lm(CL_mm ~ 1, data = thetimes))

CLlm_tmm <- (lm(CL_mm ~ dyear, data = thetimes))

CLlm_1mm <- (lm(CL_mm ~ Penetration + media_autonomy_i, data = thetimes))

CLlm_2mm <- (lm(CL_mm ~ spending_GDP + gini_rev_i, data = thetimes))

CLlm_3mm <- (lm(CL_mm ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes))

CLlm_4mm <- (lm(CL_mm ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes))

# Coefficient-comparison table: each predictor's effect on CL and CNW
# salience under three standardization schemes.  Fix vs. the original:
# the label typo "MixMax" is corrected to "MinMax" in both the values and
# the factor levels (used as facet labels in the plot below).
# NOTE(review): the unstandardized CLlm_* / CNWlm_* base models used here
# are (re)fitted further down in the script, so this block must run after
# them.  Also note that `maxim` repeats the CL-model coefficients for the
# CNW rows as well -- if intentional, CL is the reference (coef_1 = 1 for
# CL rows and CNW is expressed relative to CL); if a copy-paste slip, the
# CNW rows should use the CNWlm_* fits.  Confirm before interpreting.
df_comp <- data.frame(
  Criterion = rep(c("CL", "CNW"), each = 18),
  Predictor = factor(rep(rep(c("Time", "Crisis Sponsors", "Media Penetr.", "Media Auton.", "Gov. sp. intensity", "Gov. sp. diversity"), each = 3), times = 2), ordered = TRUE, levels = c("Time", "Crisis Sponsors", "Media Penetr.", "Media Auton.", "Gov. sp. intensity", "Gov. sp. diversity")),
  Standardization = factor(rep(c("Unstandardized", "MinMax-standardized", "z-standardized"), times = 12), ordered = TRUE, levels = c("Unstandardized", "MinMax-standardized", "z-standardized")),
  # Point estimates: one triple (raw, min-max, z) per predictor, CL first.
  coef =
    c(
      CLlm_t$coef[2], CLlm_tmm$coef[2], CLlm_tz$coef[2],
      CLlm_4$coef[6], CLlm_4mm$coef[6], CLlm_4z$coef[6],
      CLlm_1$coef[2], CLlm_1mm$coef[2], CLlm_1z$coef[2],
      CLlm_1$coef[3], CLlm_1mm$coef[3], CLlm_1z$coef[3],
      CLlm_2$coef[2], CLlm_2mm$coef[2], CLlm_2z$coef[2],
      CLlm_2$coef[3], CLlm_2mm$coef[3], CLlm_2z$coef[3],
      CNWlm_t$coef[2], CNWlm_tmm$coef[2], CNWlm_tz$coef[2],
      CNWlm_4$coef[6], CNWlm_4mm$coef[6], CNWlm_4z$coef[6],
      CNWlm_1$coef[2], CNWlm_1mm$coef[2], CNWlm_1z$coef[2],
      CNWlm_1$coef[3], CNWlm_1mm$coef[3], CNWlm_1z$coef[3],
      CNWlm_2$coef[2], CNWlm_2mm$coef[2], CNWlm_2z$coef[2],
      CNWlm_2$coef[3], CNWlm_2mm$coef[3], CNWlm_2z$coef[3]
    ),
  # 95% confidence-interval bounds, same ordering as `coef`.
  LL95 =
    c(
      confint(CLlm_t)[2, 1], confint(CLlm_tmm)[2, 1], confint(CLlm_tz)[2, 1],
      confint(CLlm_4)[6, 1], confint(CLlm_4mm)[6, 1], confint(CLlm_4z)[6, 1],
      confint(CLlm_1)[2, 1], confint(CLlm_1mm)[2, 1], confint(CLlm_1z)[2, 1],
      confint(CLlm_1)[3, 1], confint(CLlm_1mm)[3, 1], confint(CLlm_1z)[3, 1],
      confint(CLlm_2)[2, 1], confint(CLlm_2mm)[2, 1], confint(CLlm_2z)[2, 1],
      confint(CLlm_2)[3, 1], confint(CLlm_2mm)[3, 1], confint(CLlm_2z)[3, 1],
      confint(CNWlm_t)[2, 1], confint(CNWlm_tmm)[2, 1], confint(CNWlm_tz)[2, 1],
      confint(CNWlm_4)[6, 1], confint(CNWlm_4mm)[6, 1], confint(CNWlm_4z)[6, 1],
      confint(CNWlm_1)[2, 1], confint(CNWlm_1mm)[2, 1], confint(CNWlm_1z)[2, 1],
      confint(CNWlm_1)[3, 1], confint(CNWlm_1mm)[3, 1], confint(CNWlm_1z)[3, 1],
      confint(CNWlm_2)[2, 1], confint(CNWlm_2mm)[2, 1], confint(CNWlm_2z)[2, 1],
      confint(CNWlm_2)[3, 1], confint(CNWlm_2mm)[3, 1], confint(CNWlm_2z)[3, 1]
    ),
  UL95 =
    c(
      confint(CLlm_t)[2, 2], confint(CLlm_tmm)[2, 2], confint(CLlm_tz)[2, 2],
      confint(CLlm_4)[6, 2], confint(CLlm_4mm)[6, 2], confint(CLlm_4z)[6, 2],
      confint(CLlm_1)[2, 2], confint(CLlm_1mm)[2, 2], confint(CLlm_1z)[2, 2],
      confint(CLlm_1)[3, 2], confint(CLlm_1mm)[3, 2], confint(CLlm_1z)[3, 2],
      confint(CLlm_2)[2, 2], confint(CLlm_2mm)[2, 2], confint(CLlm_2z)[2, 2],
      confint(CLlm_2)[3, 2], confint(CLlm_2mm)[3, 2], confint(CLlm_2z)[3, 2],
      confint(CNWlm_t)[2, 2], confint(CNWlm_tmm)[2, 2], confint(CNWlm_tz)[2, 2],
      confint(CNWlm_4)[6, 2], confint(CNWlm_4mm)[6, 2], confint(CNWlm_4z)[6, 2],
      confint(CNWlm_1)[2, 2], confint(CNWlm_1mm)[2, 2], confint(CNWlm_1z)[2, 2],
      confint(CNWlm_1)[3, 2], confint(CNWlm_1mm)[3, 2], confint(CNWlm_1z)[3, 2],
      confint(CNWlm_2)[2, 2], confint(CNWlm_2mm)[2, 2], confint(CNWlm_2z)[2, 2],
      confint(CNWlm_2)[3, 2], confint(CNWlm_2mm)[3, 2], confint(CNWlm_2z)[3, 2]
    ),
  # Reference coefficients used as the denominator for coef_1/LL95_1/UL95_1
  # (see NOTE above about the CL entries being repeated for the CNW rows).
  maxim =
    c(
      CLlm_t$coef[2], CLlm_tmm$coef[2], CLlm_tz$coef[2],
      CLlm_4$coef[6], CLlm_4mm$coef[6], CLlm_4z$coef[6],
      CLlm_1$coef[2], CLlm_1mm$coef[2], CLlm_1z$coef[2],
      CLlm_1$coef[3], CLlm_1mm$coef[3], CLlm_1z$coef[3],
      CLlm_2$coef[2], CLlm_2mm$coef[2], CLlm_2z$coef[2],
      CLlm_2$coef[3], CLlm_2mm$coef[3], CLlm_2z$coef[3],
      CLlm_t$coef[2], CLlm_tmm$coef[2], CLlm_tz$coef[2],
      CLlm_4$coef[6], CLlm_4mm$coef[6], CLlm_4z$coef[6],
      CLlm_1$coef[2], CLlm_1mm$coef[2], CLlm_1z$coef[2],
      CLlm_1$coef[3], CLlm_1mm$coef[3], CLlm_1z$coef[3],
      CLlm_2$coef[2], CLlm_2mm$coef[2], CLlm_2z$coef[2],
      CLlm_2$coef[3], CLlm_2mm$coef[3], CLlm_2z$coef[3]
    )
)

# Rescale point estimates and CI bounds relative to the reference
# coefficient stored in `maxim`.
df_comp <- transform(
  df_comp,
  coef_1 = coef / maxim,
  LL95_1 = LL95 / maxim,
  UL95_1 = UL95 / maxim
)

# Faceted crossbar plot of the rescaled coefficients: predictors in rows,
# standardization schemes in columns, one crossbar (estimate + 95% CI) per
# criterion.  theme_soft() is a custom theme defined elsewhere in the
# project.
gg_effsizecomp <- ggplot(df_comp, aes(y = coef_1, ymin = LL95_1, ymax = UL95_1, x = Criterion, shape = Criterion, color = Criterion, fill = Criterion)) +
  geom_crossbar(width = 0.9, alpha = .25) +
  geom_point(size = 2.25) +
  facet_grid(Predictor ~ Standardization) +
  scale_y_continuous(breaks = seq(0, 1, 0.25), limits = c(-1.0, 1.5)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  theme_soft() +
  scale_color_viridis_d(option = "viridis", begin = 0, end = 0.65) +
  xlab("Dependent variable") +
  ylab("Regression coefficient, scaled to 0-1 range") +
  theme(legend.position = "bottom") +
  ggtitle("Comparing effect sizes with different standardization methods")



# "Unstandardized" base models for the effect-size comparison (these feed
# the df_comp table constructed earlier in the file, so df_comp must be
# built after this point in execution order).
# NOTE(review): CL_pcent/CNW_pcent are already cl.share/cnw.share * 100,
# so the additional 100 * puts these coefficients on a basis-point
# (per-10,000) scale -- confirm this scaling is intended.
CLlm_t <- (lm(100 * CL_pcent ~ dyear, data = thetimes))

CLlm_1 <- (lm(100 * CL_pcent ~ Penetration + media_autonomy_i, data = thetimes))

CLlm_2 <- (lm(100 * CL_pcent ~ spending_GDP + gini_rev_i, data = thetimes))

CLlm_3 <- (lm(100 * CL_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes))

CLlm_4 <- (lm(100 * CL_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes))

CNWlm_0 <- (lm(100 * CNW_pcent ~ 1, data = thetimes))

CNWlm_t <- (lm(100 * CNW_pcent ~ dyear, data = thetimes))

CNWlm_1 <- (lm(100 * CNW_pcent ~ Penetration + media_autonomy_i, data = thetimes))

CNWlm_2 <- (lm(100 * CNW_pcent ~ spending_GDP + gini_rev_i, data = thetimes))

CNWlm_3 <- (lm(100 * CNW_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes))

CNWlm_4 <- (lm(100 * CNW_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes))

# Export the descriptive figures (16x16 cm, print-quality 1200 dpi,
# scaled 1.25x).  Arguments are spelled out by their full names: the
# original passed the plot positionally into `filename` and relied on R's
# partial argument matching (`file` -> `filename`, `unit` -> `units`),
# which is fragile and flagged by lintr.  Output is identical.
ggsave(filename = "wave_count_4np.png", plot = gg_year_waves, units = "cm", width = 16, height = 16, dpi = 1200, scale = 1.25)

ggsave(filename = "wave_share_4np.png", plot = gg_year_waveshare, units = "cm", width = 16, height = 16, dpi = 1200, scale = 1.25)

ggsave(filename = "wave_duration_4np.png", plot = gg_wave_duration, units = "cm", width = 16, height = 16, dpi = 1200, scale = 1.25)

ggsave(filename = "cl_share_4np.png", plot = gg_year_clshare, units = "cm", width = 16, height = 16, dpi = 1200, scale = 1.25)

ggsave(filename = "cl_articles_4np.png", plot = gg_year_clarticles, units = "cm", width = 16, height = 16, dpi = 1200, scale = 1.25)

ggsave(filename = "articles_4np.png", plot = gg_year_articles, units = "cm", width = 16, height = 16, dpi = 1200, scale = 1.25)

ggsave(filename = "wave_articles_4np.png", plot = gg_year_wavearticles, units = "cm", width = 16, height = 16, dpi = 1200, scale = 1.25)


# Final model ladders on a common estimation sample: M1 = CL salience,
# M2 = CNW salience, M3 = log CNW count, M4 = sponsor-mention rate.
# The full model M*.X is fitted first and the nested models are then
# fitted on M*.X$model -- the full model's complete-case data -- so that
# every model in a ladder uses the same rows and anova() comparisons are
# valid.
M1.X <- (lm(CL_pcent ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000 + dyear, data = thetimes))
M1.0 <- (lm(CL_pcent ~ 1, data = M1.X$model))
M1.1 <- (lm(CL_pcent ~ 1 + dyear, data = M1.X$model))
M1.2 <- (lm(CL_pcent ~ 1 + Penetration + media_autonomy_i, data = M1.X$model))
M1.3 <- (lm(CL_pcent ~ 1 + spending_GDP + gini_rev_i, data = M1.X$model))
M1.4 <- (lm(CL_pcent ~ 1 + ccORG1000, data = M1.X$model))
M1.5 <- (lm(CL_pcent ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = M1.X$model))
M1.6 <- (lm(CL_pcent ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = M1.X$model))
M1.7 <- (lm(CL_pcent ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000 + dyear, data = M1.X$model))

M2.X <- (lm(CNW_pcent ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000 + dyear, data = thetimes))
M2.0 <- (lm(CNW_pcent ~ 1, data = M2.X$model))
M2.1 <- (lm(CNW_pcent ~ 1 + dyear, data = M2.X$model))
M2.2 <- (lm(CNW_pcent ~ 1 + Penetration + media_autonomy_i, data = M2.X$model))
M2.3 <- (lm(CNW_pcent ~ 1 + spending_GDP + gini_rev_i, data = M2.X$model))
M2.4 <- (lm(CNW_pcent ~ 1 + ccORG1000, data = M2.X$model))
M2.5 <- (lm(CNW_pcent ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = M2.X$model))
M2.6 <- (lm(CNW_pcent ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = M2.X$model))
M2.7 <- (lm(CNW_pcent ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000 + dyear, data = M2.X$model))

M3.X <- (lm(logCNW ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000 + dyear, data = thetimes))
M3.0 <- (lm(logCNW ~ 1, data = M3.X$model))
M3.1 <- (lm(logCNW ~ 1 + dyear, data = M3.X$model))
M3.2 <- (lm(logCNW ~ 1 + Penetration + media_autonomy_i, data = M3.X$model))
M3.3 <- (lm(logCNW ~ 1 + spending_GDP + gini_rev_i, data = M3.X$model))
M3.4 <- (lm(logCNW ~ 1 + ccORG1000, data = M3.X$model))
M3.5 <- (lm(logCNW ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = M3.X$model))
M3.6 <- (lm(logCNW ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = M3.X$model))
M3.7 <- (lm(logCNW ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000 + dyear, data = M3.X$model))

# Mediator model ladder (sponsor-mention rate as outcome).
M4.X <- (lm(ccORG1000 ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + dyear, data = thetimes))
M4.0 <- (lm(ccORG1000 ~ 1 + Penetration + media_autonomy_i, data = M4.X$model))
M4.1 <- (lm(ccORG1000 ~ 1 + spending_GDP + gini_rev_i, data = M4.X$model))
M4.2 <- (lm(ccORG1000 ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = M4.X$model))
M4.3 <- (lm(ccORG1000 ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + dyear, data = M4.X$model))


# Model comparisons and output tables for the M1-M3 ladders.
# r2() -- presumably performance::r2, loaded elsewhere; etaSquared() is
# from the lsr package.

# M1 (CL salience): trend and mediator vs. null, then the two block
# orderings (media landscape first / government spending first).
anova(M1.0, M1.1)
anova(M1.0, M1.4)
anova(M1.0, M1.2, M1.5, M1.6, M1.7)
anova(M1.0, M1.3, M1.5, M1.6, M1.7)

M1_r2_all <- cbind(r2(M1.0), r2(M1.1), r2(M1.2), r2(M1.3), r2(M1.4), r2(M1.5), r2(M1.6), r2(M1.7))

M1_r2_ml_first <- cbind(r2(M1.0), r2(M1.2), r2(M1.5), r2(M1.6), r2(M1.7))

M1_r2_gs_first <- cbind(r2(M1.0), r2(M1.3), r2(M1.5), r2(M1.6), r2(M1.7))

stargazer(M1.0, M1.2, M1.5, M1.6, M1.7, M1.1, type = "text", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)
stargazer(M1.2, M1.5, M1.6, M1.7, M1.1, type = "text", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)

stargazer(M1.2, M1.5, M1.6, M1.7, M1.1, type = "latex", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = FALSE)


# M2 (CNW salience): same comparisons.
anova(M2.0, M2.1)
anova(M2.0, M2.4)
anova(M2.0, M2.2, M2.5, M2.6, M2.7)
anova(M2.0, M2.3, M2.5, M2.6, M2.7)

M2_r2_all <- cbind(r2(M2.0), r2(M2.1), r2(M2.2), r2(M2.3), r2(M2.4), r2(M2.5), r2(M2.6), r2(M2.7))

M2_r2_ml_first <- cbind(r2(M2.0), r2(M2.2), r2(M2.5), r2(M2.6), r2(M2.7))

M2_r2_gs_first <- cbind(r2(M2.0), r2(M2.3), r2(M2.5), r2(M2.6), r2(M2.7))

stargazer(M2.0, M2.2, M2.5, M2.6, M2.7, M2.1, type = "text", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)
stargazer(M2.2, M2.5, M2.6, M2.7, M2.1, type = "text", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)

stargazer(M2.2, M2.5, M2.6, M2.7, M2.1, type = "latex", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = FALSE)


# M3 (log wave count): same comparisons.
anova(M3.0, M3.1)
anova(M3.0, M3.4)
anova(M3.0, M3.2, M3.5, M3.6, M3.7)
anova(M3.0, M3.3, M3.5, M3.6, M3.7)

M3_r2_all <- cbind(r2(M3.0), r2(M3.1), r2(M3.2), r2(M3.3), r2(M3.4), r2(M3.5), r2(M3.6), r2(M3.7))

M3_r2_ml_first <- cbind(r2(M3.0), r2(M3.2), r2(M3.5), r2(M3.6), r2(M3.7))

M3_r2_gs_first <- cbind(r2(M3.0), r2(M3.3), r2(M3.5), r2(M3.6), r2(M3.7))

stargazer(M3.0, M3.2, M3.5, M3.6, M3.7, M3.1, type = "text", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)
stargazer(M3.2, M3.5, M3.6, M3.7, M3.1, type = "text", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)


# Cross-outcome summary tables.
stargazer(M1.5, M1.7, M2.5, M2.7, M3.5, M3.7, type = "text", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = FALSE)

stargazer(M1.7, M2.7, M3.7, type = "text", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = FALSE)
stargazer(M3.2, M3.5, M3.6, M3.7, M3.1, type = "latex", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = FALSE)

# Effect sizes (eta squared) for the full CL model.
etaSquared(M1.7)

# Mediation for CL salience on the common estimation sample: outcome
# model M1.6, mediator model M4.2, mediator = sponsor-mention rate.
M1.8_MP <- mediate(model.y = M1.6, model.m = M4.2, treat = "Penetration", mediator = "ccORG1000")
M1.8_MA <- mediate(model.y = M1.6, model.m = M4.2, treat = "media_autonomy_i", mediator = "ccORG1000")
M1.8_SI <- mediate(model.y = M1.6, model.m = M4.2, treat = "spending_GDP", mediator = "ccORG1000")
M1.8_SD <- mediate(model.y = M1.6, model.m = M4.2, treat = "gini_rev_i", mediator = "ccORG1000")

# Collect the CL-salience mediation results, one row per treatment:
# indirect (IE, d0), direct (DE, z0) and total (TE, tau) effects with
# 95% CIs and p-values as returned by mediation::mediate.
df_mediation_M1 <- data.frame(
  Criterion = c("CL Salience", "CL Salience", "CL Salience", "CL Salience"),
  Treatment = c("Media Penetration", "Media Autonomy", "Spending Intensity", "Spending Diversity"),
  Mediator = c("ccORG1000", "ccORG1000", "ccORG1000", "ccORG1000"),
  IE.est = c(M1.8_MP$d0, M1.8_MA$d0, M1.8_SI$d0, M1.8_SD$d0),
  IE.LL95 = c(M1.8_MP$d0.ci[1], M1.8_MA$d0.ci[1], M1.8_SI$d0.ci[1], M1.8_SD$d0.ci[1]),
  IE.UL95 = c(M1.8_MP$d0.ci[2], M1.8_MA$d0.ci[2], M1.8_SI$d0.ci[2], M1.8_SD$d0.ci[2]),
  IE.p = c(M1.8_MP$d0.p, M1.8_MA$d0.p, M1.8_SI$d0.p, M1.8_SD$d0.p),
  DE.est = c(M1.8_MP$z0, M1.8_MA$z0, M1.8_SI$z0, M1.8_SD$z0),
  DE.LL95 = c(M1.8_MP$z0.ci[1], M1.8_MA$z0.ci[1], M1.8_SI$z0.ci[1], M1.8_SD$z0.ci[1]),
  DE.UL95 = c(M1.8_MP$z0.ci[2], M1.8_MA$z0.ci[2], M1.8_SI$z0.ci[2], M1.8_SD$z0.ci[2]),
  DE.p = c(M1.8_MP$z0.p, M1.8_MA$z0.p, M1.8_SI$z0.p, M1.8_SD$z0.p),
  TE.est = c(M1.8_MP$tau.coef, M1.8_MA$tau.coef, M1.8_SI$tau.coef, M1.8_SD$tau.coef),
  TE.LL95 = c(M1.8_MP$tau.ci[1], M1.8_MA$tau.ci[1], M1.8_SI$tau.ci[1], M1.8_SD$tau.ci[1]),
  TE.UL95 = c(M1.8_MP$tau.ci[2], M1.8_MA$tau.ci[2], M1.8_SI$tau.ci[2], M1.8_SD$tau.ci[2]),
  TE.p = c(M1.8_MP$tau.p, M1.8_MA$tau.p, M1.8_SI$tau.p, M1.8_SD$tau.p)
)
# Same collection for the CNW-salience outcome.
# NOTE(review): the M2.8_* mediate objects are not created in this
# section -- confirm they are fitted elsewhere (analogues of the M1.8_*
# calls with model.y = M2.6), otherwise this block errors.
df_mediation_M2 <- data.frame(
  Criterion = c("CNW Salience", "CNW Salience", "CNW Salience", "CNW Salience"),
  Treatment = c("Media Penetration", "Media Autonomy", "Spending Intensity", "Spending Diversity"),
  Mediator = c("ccORG1000", "ccORG1000", "ccORG1000", "ccORG1000"),
  IE.est = c(M2.8_MP$d0, M2.8_MA$d0, M2.8_SI$d0, M2.8_SD$d0),
  IE.LL95 = c(M2.8_MP$d0.ci[1], M2.8_MA$d0.ci[1], M2.8_SI$d0.ci[1], M2.8_SD$d0.ci[1]),
  IE.UL95 = c(M2.8_MP$d0.ci[2], M2.8_MA$d0.ci[2], M2.8_SI$d0.ci[2], M2.8_SD$d0.ci[2]),
  IE.p = c(M2.8_MP$d0.p, M2.8_MA$d0.p, M2.8_SI$d0.p, M2.8_SD$d0.p),
  DE.est = c(M2.8_MP$z0, M2.8_MA$z0, M2.8_SI$z0, M2.8_SD$z0),
  DE.LL95 = c(M2.8_MP$z0.ci[1], M2.8_MA$z0.ci[1], M2.8_SI$z0.ci[1], M2.8_SD$z0.ci[1]),
  DE.UL95 = c(M2.8_MP$z0.ci[2], M2.8_MA$z0.ci[2], M2.8_SI$z0.ci[2], M2.8_SD$z0.ci[2]),
  DE.p = c(M2.8_MP$z0.p, M2.8_MA$z0.p, M2.8_SI$z0.p, M2.8_SD$z0.p),
  TE.est = c(M2.8_MP$tau.coef, M2.8_MA$tau.coef, M2.8_SI$tau.coef, M2.8_SD$tau.coef),
  TE.LL95 = c(M2.8_MP$tau.ci[1], M2.8_MA$tau.ci[1], M2.8_SI$tau.ci[1], M2.8_SD$tau.ci[1]),
  TE.UL95 = c(M2.8_MP$tau.ci[2], M2.8_MA$tau.ci[2], M2.8_SI$tau.ci[2], M2.8_SD$tau.ci[2]),
  TE.p = c(M2.8_MP$tau.p, M2.8_MA$tau.p, M2.8_SI$tau.p, M2.8_SD$tau.p)
)
df_mediation_M3 <- data.frame(
  Criterion = c("CNW Count", "CNW Count", "CNW Count", "CNW Count"),
  Treatment = c("Media Penetration", "Media Autonomy", "Spending Intensity", "Spending Diversity"),
  Mediator = c("ccORG1000", "ccORG1000", "ccORG1000", "ccORG1000"),
  IE.est = c(M3.8_MP$d0, M3.8_MA$d0, M3.8_SI$d0, M3.8_SD$d0),
  IE.LL95 = c(M3.8_MP$d0.ci[1], M3.8_MA$d0.ci[1], M3.8_SI$d0.ci[1], M3.8_SD$d0.ci[1]),
  IE.UL95 = c(M3.8_MP$d0.ci[2], M3.8_MA$d0.ci[2], M3.8_SI$d0.ci[2], M3.8_SD$d0.ci[2]),
  IE.p = c(M3.8_MP$d0.p, M3.8_MA$d0.p, M3.8_SI$d0.p, M3.8_SD$d0.p),
  DE.est = c(M3.8_MP$z0, M3.8_MA$z0, M3.8_SI$z0, M3.8_SD$z0),
  DE.LL95 = c(M3.8_MP$z0.ci[1], M3.8_MA$z0.ci[1], M3.8_SI$z0.ci[1], M3.8_SD$z0.ci[1]),
  DE.UL95 = c(M3.8_MP$z0.ci[2], M3.8_MA$z0.ci[2], M3.8_SI$z0.ci[2], M3.8_SD$z0.ci[2]),
  DE.p = c(M3.8_MP$z0.p, M3.8_MA$z0.p, M3.8_SI$z0.p, M3.8_SD$z0.p),
  TE.est = c(M3.8_MP$tau.coef, M3.8_MA$tau.coef, M3.8_SI$tau.coef, M3.8_SD$tau.coef),
  TE.LL95 = c(M3.8_MP$tau.ci[1], M3.8_MA$tau.ci[1], M3.8_SI$tau.ci[1], M3.8_SD$tau.ci[1]),
  TE.UL95 = c(M3.8_MP$tau.ci[2], M3.8_MA$tau.ci[2], M3.8_SI$tau.ci[2], M3.8_SD$tau.ci[2]),
  TE.p = c(M3.8_MP$tau.p, M3.8_MA$tau.p, M3.8_SI$tau.p, M3.8_SD$tau.p)
)

# Stack the three per-criterion tables (12 rows = 3 criteria x 4 treatments).
df_mediation <- rbind(df_mediation_M1, df_mediation_M2, df_mediation_M3)

# Reshape wide -> long so each row is one (criterion, treatment, effect-type)
# estimate with its CI, for plotting.
# NOTE(review): `times = 3` / `each = 12` hard-code the 12-row layout above;
# update both if criteria or treatments change.
df_mediation_long <- data.frame(
  Criterion = factor(rep(df_mediation$Criterion, times = 3), ordered = TRUE, levels = c("CL Salience", "CNW Salience", "CNW Count")),
  Treatment = factor(rep(df_mediation$Treatment, times = 3), ordered = TRUE, levels = c("Media Penetration", "Media Autonomy", "Spending Intensity", "Spending Diversity")),
  Mediator = rep(df_mediation$Mediator, times = 3),
  Effect = factor(rep(c("Indirect", "Direct", "Total"), each = 12), ordered = TRUE, levels = c("Indirect", "Direct", "Total")),
  Est = c(df_mediation$IE.est, df_mediation$DE.est, df_mediation$TE.est),
  LL95 = c(df_mediation$IE.LL95, df_mediation$DE.LL95, df_mediation$TE.LL95),
  UL95 = c(df_mediation$IE.UL95, df_mediation$DE.UL95, df_mediation$TE.UL95)
)

# Exploratory coefficient plot (autoprints at top level in an interactive
# session; not saved).
ggplot(df_mediation_long, aes(ymin = LL95, ymax = UL95, y = Est, x = Criterion, group = Effect, color = Effect, shape = Effect)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_pointrange(position = position_dodge(0.75)) +
  facet_grid(Treatment ~ .) +
  coord_flip() +
  theme_soft() +
  scale_color_viridis_d(begin = 0, end = .85)

# NOTE(review): this first gg_mediation is immediately overwritten by the
# second assignment below — dead code kept for provenance; consider removing.
gg_mediation <- ggplot(df_mediation_long, aes(ymin = LL95, ymax = UL95, y = Est, x = Criterion, group = Effect, color = Effect, shape = Effect)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_pointrange(position = position_dodge(0.75)) +
  facet_grid(Treatment ~ ., scales = "free_y") +
  theme_soft() +
  scale_color_viridis_d(begin = 0, end = .7) +
  ylab("Estimate")

# Final figure: direct/indirect/total effects faceted by treatment x criterion.
gg_mediation <- ggplot(df_mediation_long, aes(ymin = LL95, ymax = UL95, y = Est, x = Effect, group = Effect, color = Effect, shape = Effect)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_errorbar(width = 0.9, position = position_dodge(0.75)) +
  geom_point(size = 2.0) +
  facet_grid(Treatment ~ Criterion, scales = "free_y") +
  theme_soft() +
  scale_color_viridis_d(begin = 0, end = .7) +
  ylab("Estimate") +
  theme(legend.position = "bottom") +
  ggtitle("Direct, indirect and total effects of predictors")

# Side-by-side with the effect-size comparison plot.
# NOTE(review): gg_effsizecomp is defined in an earlier section; grid.arrange
# also draws to the active device and returns the arranged gtable.
gg_mediation_effsizecomp <- grid.arrange(gg_mediation, gg_effsizecomp, ncol = 2)

ggsave(gg_mediation, file = "mediation.png", unit = "cm", width = 12, height = 12, dpi = 1200, scale = 1.25)

ggsave(gg_mediation_effsizecomp, file = "mediation_effsizecomp.png", unit = "cm", width = 16, height = 16, dpi = 1200, scale = 1.85)


# Mediation models for the remaining criteria: M2.* = CNW Salience,
# M3.* = CNW Count; one call per treatment, mediator always "ccORG1000".
# NOTE(review): df_mediation_M2 / df_mediation_M3 above consume these objects,
# so in a clean top-to-bottom run these calls must be moved before that point.
M2.8_MP <- mediate(model.y = M2.6, model.m = M4.2, treat = "Penetration", mediator = "ccORG1000")
M2.8_MA <- mediate(model.y = M2.6, model.m = M4.2, treat = "media_autonomy_i", mediator = "ccORG1000")
M2.8_SI <- mediate(model.y = M2.6, model.m = M4.2, treat = "spending_GDP", mediator = "ccORG1000")
M2.8_SD <- mediate(model.y = M2.6, model.m = M4.2, treat = "gini_rev_i", mediator = "ccORG1000")

M3.8_MP <- mediate(model.y = M3.6, model.m = M4.2, treat = "Penetration", mediator = "ccORG1000")
M3.8_MA <- mediate(model.y = M3.6, model.m = M4.2, treat = "media_autonomy_i", mediator = "ccORG1000")
M3.8_SI <- mediate(model.y = M3.6, model.m = M4.2, treat = "spending_GDP", mediator = "ccORG1000")
M3.8_SD <- mediate(model.y = M3.6, model.m = M4.2, treat = "gini_rev_i", mediator = "ccORG1000")



# Alternative mediation specifications based on the ORGlm_* mediator models
# and COUNTlm_3* outcome models (defined in earlier sections).
med_penetration3 <- mediate(model.m = ORGlm_3, model.y = COUNTlm_3Y, mediator = "ccORG1000", treat = "Penetration")
med_autonomy3 <- mediate(model.m = ORGlm_3, model.y = COUNTlm_3Y, mediator = "ccORG1000", treat = "media_autonomy_i")
med_spending3 <- mediate(model.m = ORGlm_3, model.y = COUNTlm_3Y, mediator = "ccORG1000", treat = "spending_GDP")
med_diversity3 <- mediate(model.m = ORGlm_3, model.y = COUNTlm_3Y, mediator = "ccORG1000", treat = "gini_rev_i")

# NOTE(review): the X-variants deliberately pair different mediator models
# (ORGlm_1 vs ORGlm_2) with the same outcome model — confirm intended.
med_penetration3X <- mediate(model.m = ORGlm_1, model.y = COUNTlm_3X, mediator = "ccORG1000", treat = "Penetration")
med_autonomy3X <- mediate(model.m = ORGlm_1, model.y = COUNTlm_3X, mediator = "ccORG1000", treat = "media_autonomy_i")
med_spending3X <- mediate(model.m = ORGlm_2, model.y = COUNTlm_3X, mediator = "ccORG1000", treat = "spending_GDP")
med_diversity3X <- mediate(model.m = ORGlm_2, model.y = COUNTlm_3X, mediator = "ccORG1000", treat = "gini_rev_i")

6.1.2 Time series analysis

Code
###################################################
#### Load data
# Pre-computed .RData objects from earlier pipeline stages: yearly topic
# shares at three resolutions (to20/to50/to250), CNW-level data (wd250x),
# topic labels, spending, newspaper metadata, media and V-Dem indicators.
load("to20_year.RData")
load("to50_year.RData")
load("to250_year.RData")
load("wd250x.RData")
load("to_lab.RData")
load("spend.RData")
load("thetimes.RData")
load("df_bes_year.RData")
load("media_indicators.RData")
load("co_vdem.RData")

# Convert the 1-based year index to calendar years (index 1 -> 1785).
to250_year$dyear <- to250_year$year + 1784

##################################################
#### Create a data frame of the number of CRISIS NEWS WAVES
####

# Break vectors for axes / prediction points.
# NOTE(review): `fifty` appears unused in this chunk — verify before removing.
fifty <- c(1800, 1850, 1900, 1950, 2000)
y25 <- c(1785, 1800, 1825, 1850, 1875, 1900, 1925, 1950, 1975, 2000, 2020, 2025, 2050)

# CNW starts per year, re-indexed onto the full 1785-2020 range; years with
# no wave become 0 in the `cnw` column.
cnw.count <- (table(wd250x$year))

df.cnw.count <- data.frame(year = 1785:2020, cnw.count = 0)
df.cnw.count$cnw.count <- cnw.count[match(df.cnw.count$year, as.numeric(names(cnw.count)))]
df.cnw.count$cnw <- ifelse(is.na(df.cnw.count$cnw.count), 0, df.cnw.count$cnw.count)

# CNW count over time: loess trend (teal) plus linear trend (red), year labels
# sized by count.
# NOTE(review): scale_y_log10 silently drops zero-count years (log10(0) is
# -Inf), removing them from both smoothers — confirm this is intended.
count_cnw_year <- ggplot(df.cnw.count, aes(x = year, y = cnw)) +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Count of crisis news waves") +
  geom_text(aes(label = year, size = cnw)) +
  scale_y_log10(breaks = c(5, 10, 20, 30, 40, 50))

ggsave(count_cnw_year, file = "soft_count_cnw_year.svg", unit = "cm", width = 20, height = 10, dpi = 1200, scale = 1.25)

# Linear trend of yearly CNW counts (time re-based so intercept = 1785),
# with predictions at the y25 points (incl. out-of-sample 2025/2050).
lm.cnw_count.year <- (lm(cnw ~ I(year - 1785), data = df.cnw.count))
summary(lm.cnw_count.year)
confint(lm.cnw_count.year)
pred.cnw_count.year <- predict(lm.cnw_count.year, newdata = data.frame(year = y25))
names(pred.cnw_count.year) <- y25
### B=.026 (.006); t=4.092; p<.001

# First/last-year snapshots of topic-area shares, with cumulative shares for
# stacked-label placement and one viridis color per topic.
# NOTE(review): viridis(21) assumes exactly 21 topic-area rows per year.
to20_2020 <- subset(to20_year, dyear == 2020)
to20_2020$cumshare <- cumsum(to20_2020$share)
to20_2020$tocol <- viridis(21)

to20_1785 <- subset(to20_year, dyear == 1785)
to20_1785$cumshare <- cumsum(to20_1785$share)
to20_1785$tocol <- viridis(21)

# Yearly share of "active" topics (share > 0.1%) at each resolution,
# attached to `thetimes` by year.
thetimes$active250 <- NA
thetimes$active50 <- NA
thetimes$active20 <- NA

active250 <- tapply(to250_year$share > 0.001, to250_year$dyear, mean)
active50 <- tapply(to50_year$share > 0.001, to50_year$dyear, mean)
active20 <- tapply(to20_year$share > 0.001, to20_year$dyear, mean)

# match() coerces numeric years against the character names of the tapply
# results, which works as intended here.
thetimes$active250 <- active250[match(thetimes$year, names(active250))]
thetimes$active50 <- active50[match(thetimes$year, names(active50))]
thetimes$active20 <- active20[match(thetimes$year, names(active20))]

# Long format for plotting the three series together.
thetimes3a <- pivot_longer(thetimes, cols = c("active250", "active50", "active20"))

# Share of active CL topics over time, one series per resolution; series are
# identified by hand-placed labels instead of a legend.
# NOTE(review): label counts ("topic complexes(163)", "topics(250)") do not
# match the series names (active50/active250) — confirm which counts are
# correct; annotation x/y positions are hard-coded to this data.
active_time_cl <- ggplot(thetimes3a, aes(color = name, fill = name, y = 100 * value, x = year)) +
  geom_point() +
  geom_smooth() +
  scale_color_viridis_d(begin = 0.1, end = 0.8) +
  scale_fill_viridis_d(begin = 0.1, end = 0.8) +
  theme_bluewhite() +
  xlab("Year") +
  ylab("Share of active topics") +
  ggtitle("Crisis labelling coverage (159'732)") +
  annotate("label", fill = viridis(3, begin = 0.1, end = 0.8)[c(1, 3, 2)], color = "white", x = c(1900, 1825, 1950), y = c(95, 75, 70), label = c("topic areas(20)", "topic complexes(163)", "topics(250)")) +
  theme(legend.position = "none")

# Mean topic-area share per decade, with cumulative positions for in-bar
# labels.
# NOTE(review): the fontcolor/pattern rep() templates hard-code 21 topics x
# 25 decades and rely on aggregate()'s row ordering — fragile if either count
# changes.
to20_year$decade <- 10 * floor(to20_year$dyear / 10)
to20_decade <- aggregate(to20_year$share, by = list(to20_year$topic, to20_year$decade), FUN = "mean", na.rm = TRUE)
to20_decade$cumx <- unlist(by(to20_decade$x, INDICES = to20_decade$Group.2, cumsum))
to20_decade$fontcolor <- rep(c(rep(c("white"), times = 10), rep(c("black"), times = 11)), times = 25)
to20_decade$pattern <- rep(c(rep(c("white", "black"), times = 10), "white"), times = 25)

# Fill patterns cycling over the 21 topic areas (ggpattern).
patternscale <- c("none", "stripe", "weave", "circle", "none", "stripe", "weave", "circle", "none", "stripe", "weave", "circle", "none", "stripe", "weave", "circle", "none", "stripe", "weave", "circle", "none")

# Patterned stacked-bar variant of the decade composition plot.
gg <- ggplot(to20_decade, aes(x = Group.2, label = Group.1, fill = Group.1)) +
  geom_col_pattern(aes(y = x, pattern = Group.1), fill = "white", pattern_density = 0.1, pattern_spacing = 0.01) +
  geom_text(aes(color = fontcolor, y = 1 - (cumx - (x / 2))), hjust = 0.5, vjust = 0.5) +
  theme_bluewhite() +
  xlab("Decade") +
  ylab("Share") +
  scale_pattern_fill_viridis_d() +
  scale_color_viridis_d() +
  scale_pattern_manual(values = patternscale)


# Plain-color stacked-bar variant (exploratory, autoprints interactively).
ggplot(to20_decade, aes(x = Group.2, fill = Group.1, label = Group.1)) +
  geom_col(aes(y = x)) +
  geom_text(aes(color = fontcolor, y = 1 - (cumx - (x / 2))), hjust = 0.5, vjust = 0.5) +
  theme_bluewhite() +
  xlab("Decade") +
  ylab("Share") +
  scale_fill_viridis_d() +
  scale_color_viridis_d()

# Yearly stacked-area composition with topic labels placed in the margins of
# the first and last year (positions hard-coded).
ggplot(to20_year, aes(fill = topic, y = share * 100, x = dyear, pattern = topic)) +
  geom_area(color = "white") +
  scale_fill_viridis_d() +
  annotate("text", label = to20_2020$topic, x = c(rep(c(2021, 2031), times = 10), 2021), color = to20_2020$tocol, y = ((100 - (to20_2020$cumshare * 100)) + (100 * to20_2020$share / 2)), hjust = 0) +
  annotate("text", label = to20_1785$topic, x = c(rep(c(1762, 1772), times = 10), 1771), color = to20_1785$tocol, y = ((100 - (to20_1785$cumshare * 100)) + (100 * to20_1785$share / 2)), hjust = 0) +
  theme_bluewhite() +
  ylab("Topic's share") +
  xlab("Year")

ggsave(active_time_cl, file = "active_time_cl.svg", device = "svg", unit = "cm", width = 16, height = 12, dpi = 1200, scale = 1.25)



# Yearly Gini concentration of topic shares, at each of the three topic
# resolutions (20 / 50 / 250), written into `thetimes`.
# Gini() is assumed to come from DescTools (loaded earlier) — the to*_year
# frames hold one share per topic per calendar year (dyear).
thetimes$gini <- NA
thetimes$gini50 <- NA
thetimes$gini250 <- NA

for (yr in 1785:2020) {
  # One Gini per year and resolution, over that year's topic shares.
  thetimes[thetimes$year == yr, "gini"] <- Gini(subset(to20_year, dyear == yr)$share)
  thetimes[thetimes$year == yr, "gini50"] <- Gini(subset(to50_year, dyear == yr)$share)
  thetimes[thetimes$year == yr, "gini250"] <- Gini(subset(to250_year, dyear == yr)$share)
}


# Exploratory per-resolution Gini trajectories (autoprint interactively).
ggplot(thetimes, aes(y = gini, x = year)) +
  geom_point() +
  geom_smooth() +
  ylim(0, 1) +
  theme_bluewhite()
ggplot(thetimes, aes(y = gini50, x = year)) +
  geom_point() +
  geom_smooth() +
  ylim(0, 1) +
  theme_bluewhite()
ggplot(thetimes, aes(y = gini250, x = year)) +
  geom_point() +
  geom_smooth() +
  ylim(0, 1) +
  theme_bluewhite()

# Long format with readable series labels (car::Recode).
# NOTE(review): this label says "topic complexes(50)" while the annotate()
# call below and other figures say 163 complexes — reconcile the counts.
thetimes3 <- pivot_longer(data = thetimes, cols = c("gini", "gini50", "gini250"))
thetimes3$var <- Recode(thetimes3$name, "'gini'='topic areas(20)';'gini50'='topic complexes(50)';'gini250'='topic(250)'")

# Publication figure: 1 - Gini ("diversity") of CL coverage per year;
# series identified by hand-placed labels (positions hard-coded).
gini_time_cl <- ggplot(thetimes3, aes(y = 1 - value, x = year, color = var, fill = var, shape = var)) +
  geom_point() +
  geom_smooth() +
  ylim(0, 1) +
  scale_color_viridis_d(begin = .1, end = .8) +
  scale_fill_viridis_d(begin = .1, end = .8) +
  theme_bluewhite() +
  ylab("1- Gini coefficient") +
  xlab("Year") +
  ggtitle("Crisis labelling coverage (159'732)") +
  annotate("label", x = c(1900, 1850, 1825), y = c(0.25, 0.330, 0.500), color = viridis(3, begin = .1, end = .8)[c(2, 3, 1)], label = c("Topic complexes(163)", "Topics(250)", "Topic areas(20)")) +
  theme(legend.position = "none")

ggsave(gini_time_cl, file = "gini_time_cl.svg", device = "svg", unit = "cm", width = 16, height = 12, dpi = 1200, scale = 1.25)

# Time trend models of CL topic concentration: intercept-only vs linear vs
# quadratic (orthogonal polynomials), compared via nested-model F-tests.
thetimes$time <- thetimes$year - 1784

lm.gini.0 <- lm(gini ~ 1, data = thetimes)
lm.gini.1 <- lm(gini ~ poly(time, 1), data = thetimes)
lm.gini.2 <- lm(gini ~ poly(time, 2), data = thetimes)

lm.gini50.0 <- lm(gini50 ~ 1, data = thetimes)
lm.gini50.1 <- lm(gini50 ~ poly(time, 1), data = thetimes)
lm.gini50.2 <- lm(gini50 ~ poly(time, 2), data = thetimes)

lm.gini250.0 <- lm(gini250 ~ 1, data = thetimes)
lm.gini250.1 <- lm(gini250 ~ poly(time, 1), data = thetimes)
lm.gini250.2 <- lm(gini250 ~ poly(time, 2), data = thetimes)

# Does adding the linear / quadratic term significantly improve fit?
anova(lm.gini.0, lm.gini.1, lm.gini.2)
anova(lm.gini50.0, lm.gini50.1, lm.gini50.2)
anova(lm.gini250.0, lm.gini250.1, lm.gini250.2)

# Coefficient summaries of the quadratic and linear specifications.
summary(lm(gini ~ poly(time, 2), data = thetimes))
summary(lm(gini50 ~ poly(time, 2), data = thetimes))
summary(lm(gini250 ~ poly(time, 2), data = thetimes))

summary(lm(gini ~ poly(time, 1), data = thetimes))
summary(lm(gini50 ~ poly(time, 1), data = thetimes))
summary(lm(gini250 ~ poly(time, 1), data = thetimes))


# Map each CNW (row of wd250x) onto decade and the coarser categorizations.
wd250x$decade <- floor(as.numeric(wd250x$year) / 10) * 10
wd250x$topic20 <- to.lab[(match(wd250x$topic250, to.lab$topic250)), "topic20"]

# Lookup table: topic20 label -> alphabetical index 1..21 (kept for reference
# elsewhere).
topicno <- cbind(names(table(to.lab$topic20)), 1:21)

# Numeric topic-area code via car::Recode.
# FIX(review): the original first derived topic20_num from `topicno` and then
# immediately overwrote it with this Recode() call; the dead first assignment
# was removed (final values are unchanged).
# NOTE(review): the Recode map covers only 19 of the 21 topic20 labels —
# unmatched labels pass through unchanged, which may yield NA on numeric
# coercion; confirm the two omitted labels are intentional.
wd250x$topic20_num <- Recode(wd250x$topic20, "'POL'=1;'GEO'=2;'EPI'=3;'DIS'=4;'PUB'=5;'WEL'=6;'SCI'=7;'MIL'=8;'ECO'=9;'DOM'=10;'REL'=11;'TRA'=12;'INF'=13;'HEA'=14;'LAB'=15;'ENV'=16;'LEI'=17;'NRG'=18;'EDU'=19")

# Ordered factor version with the same explicit level order as the Recode map.
wd250x$topic20_ord <- factor(wd250x$topic20, levels = c("POL", "GEO", "EPI", "DIS", "PUB", "WEL", "SCI", "MIL", "ECO", "DOM", "REL", "TRA", "INF", "HEA", "LAB", "ENV", "LEI", "NRG", "EDU"), ordered = TRUE)


# Random vertical jitter for later label placement.
# NOTE(review): runif() without set.seed() makes this nondeterministic across
# runs, and n = 1052 hard-codes the current CNW count — prefer nrow(wd250x).
wd250x$coord <- runif(min = 0, max = 0.5, n = 1052)

# One viridis color per topic area, joined back onto the CNW rows.
color.code.list <- data.frame(topic20 = unique(wd250x$topic20), color.codes = (scales::viridis_pal()(length(unique(wd250x$topic20)))))

wd250x$color20 <- color.code.list[match(wd250x$topic20, color.code.list$topic20), "color.codes"]

wd250x$topic50 <- to.lab[match(wd250x$topic250, to.lab$topic250), "topic50"]

# Number of distinct topics with at least one CNW, per year, per resolution.
nw250 <- colSums(table(wd250x$topic250, wd250x$year) > 0)
nw50 <- colSums(table(wd250x$topic50, wd250x$year) > 0)
nw20 <- colSums(table(wd250x$topic20, wd250x$year) > 0)

thetimes$nw250 <- nw250[match(thetimes$year, names(nw250))]
thetimes$nw50 <- nw50[match(thetimes$year, names(nw50))]
thetimes$nw20 <- nw20[match(thetimes$year, names(nw20))]

# Decade-level shares of active topics; denominators 250/163/21 are the
# hard-coded topic counts at each resolution.
nw250d <- colSums(table(wd250x$topic250, wd250x$decade) > 0) / 250
nw50d <- colSums(table(wd250x$topic50, wd250x$decade) > 0) / 163
nw20d <- colSums(table(wd250x$topic20, wd250x$decade) > 0) / 21


# Average pre-wave baseline across the available look-back windows, then
# recompute wave volume and intensity including that baseline.
wd250x$baseline <- rowMeans(cbind(wd250x$baseline90, wd250x$baseline180, wd250x$baseline365, wd250x$baseline1461, wd250x$baseline3652), na.rm = TRUE)
wd250x$volume2 <- (wd250x$intensity + wd250x$baseline) * wd250x$duration
wd250x$intensity2 <- wd250x$intensity + wd250x$baseline

# Yearly CNW article volume vs yearly CL article counts, turned into shares
# of the paper's total output.
# NOTE(review): cl.share takes rows 1:235 of to20_year and reads a
# "total.count" column — presumably the first topic's rows, one per year, with
# 2020 excluded (235 vs 236 years); confirm the slice.
cnw.share <- tapply(wd250x$volume2, wd250x$year, sum, na.rm = TRUE)
cl.share <- to20_year[1:235, ]

thetimes$cnw.count <- cnw.share[match(thetimes$year, names(cnw.share))]
thetimes$cl.count <- cl.share[match(thetimes$year, cl.share$dyear), "total.count"]
thetimes$cnw.count <- replace(thetimes$cnw.count, is.na(thetimes$cnw.count), 0)
thetimes$cl.count <- replace(thetimes$cl.count, is.na(thetimes$cl.count), 0)
thetimes$cnw.share <- thetimes$cnw.count / thetimes$articles
thetimes$cl.share <- thetimes$cl.count / thetimes$articles
thetimes$cnw_to_cl.share <- thetimes$cnw.count / thetimes$cl.count
thetimes$cnw <- df.cnw.count$cnw


# Decade-level data frame of active-topic shares (from nw*d above), long
# format for plotting.
thetimes.d <- data.frame(decade = names(nw250d), nw250d = nw250d, nw50d = nw50d, nw20d = nw20d)

thetimes.d3 <- pivot_longer(thetimes.d, cols = c("nw20d", "nw50d", "nw250d"))

# Share of active CNW topics per decade; series identified by hand-placed
# labels (positions hard-coded to this data).
active_time_cnw <- ggplot(thetimes.d3, aes(x = as.numeric(decade), y = 100 * value, color = name, fill = name, shape = name)) +
  geom_point() +
  geom_smooth() +
  theme_bluewhite() +
  theme(legend.position = "none") +
  ylim(0, 100) +
  annotate("label", label = c("areas(20)", "complexes(163)", "topics(250)"), x = c(1880, 1940, 1875), y = c(65, 4, 23), fill = viridis(3, begin = .1, end = .8), color = "white") +
  ylab("Share of active topics per decade") +
  ggtitle("Crisis news wave coverage (n=17'338)") +
  scale_fill_viridis_d(begin = .1, end = .8) +
  scale_color_viridis_d(begin = .1, end = .8)

ggsave(active_time_cnw, file = "active_time_cnw.svg", device = "svg", unit = "cm", width = 16, height = 12, dpi = 1200, scale = 1.25)


# Yearly variant: convert active-topic counts to shares using the hard-coded
# topic totals (250 / 163 / 21).
thetimes3b <- pivot_longer(thetimes, cols = c("nw250", "nw50", "nw20"))
thetimes3b$share <- ifelse(thetimes3b$name == "nw250", thetimes3b$value / 250, ifelse(thetimes3b$name == "nw50", thetimes3b$value / 163, thetimes3b$value / 21))

# Exploratory plots (autoprint interactively): with and without points, and
# restricted to the two finer resolutions.
ggplot(thetimes3b, aes(y = 100 * share, x = year, color = name, fill = name, shape = name)) +
  geom_point() +
  geom_smooth() +
  ylab("Share of active topics") +
  xlab("Year") +
  theme_bluewhite() +
  scale_color_viridis_d(begin = 0.1, end = 0.8) +
  scale_fill_viridis_d(begin = 0.1, end = 0.8)

ggplot(thetimes3b, aes(y = 100 * share, x = year, color = name, fill = name, shape = name)) +
  geom_smooth() +
  ylab("Share of active topics") +
  xlab("Year") +
  theme_bluewhite() +
  scale_color_viridis_d(begin = 0.1, end = 0.8) +
  scale_fill_viridis_d(begin = 0.1, end = 0.8)

# NOTE(review): the label "Topics(50)" conflicts with the 163-complex naming
# used elsewhere, and viridis(2, end = 0.1, begin = 0.8) reverses the palette
# direction — confirm both are intentional.
ggplot(subset(thetimes3b, !(name == "nw20")), aes(y = 100 * share, x = year, color = name, fill = name, shape = name)) +
  geom_point() +
  geom_smooth() +
  ylab("Share of active topics") +
  xlab("Year") +
  theme_bluewhite() +
  theme(legend.position = "none") +
  scale_color_viridis_d(begin = 0.1, end = 0.8) +
  scale_fill_viridis_d(begin = 0.1, end = 0.8) +
  annotate("label", x = c(1870, 1880), y = c(5, 2), label = c("Topic complexes(163)", "Topics(50)"), fill = viridis(2, end = 0.1, begin = 0.8), color = "white")


#### Total CNW volume per topic, at two temporal resolutions.
#### nw.to*  — per YEAR: consumed by the 1785:2020 loop further below, which
####           subsets on Group.1 == <year> to fill thetimes$nw.gini*.
#### nwd.to* — per DECADE: consumed by the decade loop filling `decades`.
#### FIX(review): originally both sets aggregated by decade (copy-paste), so
#### the yearly loop matched rows only for decade-valued years; nw.to* now
#### aggregates by year.
#### FIX(review): the nwd.to* $value lines referenced nw.to*$x instead of
#### their own $x; now self-referential. drop = FALSE keeps empty
#### topic-period combinations, whose NA sums are zeroed below.
nw.to20 <- aggregate(wd250x$volume2, by = list(wd250x$year, wd250x$topic20), FUN = "sum", drop = FALSE)
nw.to20$value <- replace(nw.to20$x, is.na(nw.to20$x), 0)

nw.to50 <- aggregate(wd250x$volume2, by = list(wd250x$year, wd250x$topic50), FUN = "sum", drop = FALSE)
nw.to50$value <- replace(nw.to50$x, is.na(nw.to50$x), 0)

nw.to250 <- aggregate(wd250x$volume2, by = list(wd250x$year, wd250x$topic250), FUN = "sum", drop = FALSE)
nw.to250$value <- replace(nw.to250$x, is.na(nw.to250$x), 0)

nwd.to20 <- aggregate(wd250x$volume2, by = list(wd250x$decade, wd250x$topic20), FUN = "sum", drop = FALSE)
nwd.to20$value <- replace(nwd.to20$x, is.na(nwd.to20$x), 0)

nwd.to50 <- aggregate(wd250x$volume2, by = list(wd250x$decade, wd250x$topic50), FUN = "sum", drop = FALSE)
nwd.to50$value <- replace(nwd.to50$x, is.na(nwd.to50$x), 0)

nwd.to250 <- aggregate(wd250x$volume2, by = list(wd250x$decade, wd250x$topic250), FUN = "sum", drop = FALSE)
nwd.to250$value <- replace(nwd.to250$x, is.na(nwd.to250$x), 0)

# Per-document topic loadings (X1..X250) for CNW documents only, long format,
# mapped onto the three categorizations. topics_doc and cnw_ids come from
# earlier sections.
cnw_doc250 <- pivot_longer(subset(topics_doc, id %in% cnw_ids), cols = paste0("X", 1:250))
cnw_doc250$topic250 <- to.lab[match(cnw_doc250$name, to.lab$topic), "topic250"]
cnw_doc250$topic50 <- to.lab[match(cnw_doc250$name, to.lab$topic), "topic50"]
cnw_doc250$topic20 <- to.lab[match(cnw_doc250$name, to.lab$topic), "topic20"]


# Yearly loading sums per topic at each resolution.
cnw_doc250 %>%
  group_by(topic250, year) %>%
  summarise(val = sum(value, na.rm = TRUE)) -> cnw_ts250
cnw_doc250 %>%
  group_by(topic50, year) %>%
  summarise(val = sum(value, na.rm = TRUE)) -> cnw_ts50
cnw_doc250 %>%
  group_by(topic20, year) %>%
  summarise(val = sum(value, na.rm = TRUE)) -> cnw_ts20

year <- 1785:2020
topic20 <- unique(to.lab$topic20)

# NOTE(review): complete(year, ...) expands only over year values already in
# the data column — the global `year` vector above is shadowed; if the intent
# was to pad the full 1785-2020 range, use complete(year = 1785:2020, ...).
# Also, the *_e objects are not used below in this chunk.
cnw_ts20 %>%
  complete(year, fill = list(val = 1), explicit = FALSE) -> cnw_ts20e

cnw_ts50 %>%
  complete(year, fill = list(val = 1), explicit = FALSE) -> cnw_ts50e

cnw_ts250 %>%
  complete(year, fill = list(val = 1), explicit = FALSE) -> cnw_ts250e

# Yearly totals replicated across topics to compute within-year shares.
# NOTE(review): times = 22/162/250 hard-code the topic-level counts and rely
# on every topic having the same set of years; elsewhere 21/163 are used for
# the same resolutions — reconcile.
rep(unclass(cnw_ts20 %>% group_by(year) %>% summarise(sum(val)))[[2]], times = 22) -> cnw_ts20$total
rep(unclass(cnw_ts50 %>% group_by(year) %>% summarise(sum(val)))[[2]], times = 162) -> cnw_ts50$total
rep(unclass(cnw_ts250 %>% group_by(year) %>% summarise(sum(val)))[[2]], times = 250) -> cnw_ts250$total

cnw_ts20$share <- cnw_ts20$val / cnw_ts20$total
cnw_ts50$share <- cnw_ts50$val / cnw_ts50$total
cnw_ts250$share <- cnw_ts250$val / cnw_ts250$total

# Yearly Gini of CNW topic loadings, per resolution.
cnw_year_gini20 <- cnw_ts20 %>%
  group_by(year) %>%
  summarise(gini20 = Gini(val))

cnw_year_gini50 <- cnw_ts50 %>%
  group_by(year) %>%
  summarise(gini50 = Gini(val))

cnw_year_gini250 <- cnw_ts250 %>%
  group_by(year) %>%
  summarise(gini250 = Gini(val))

# Yearly share of active topics (share > 0.1%).
# NOTE(review): denominators 22/162 differ from the 21/163 used elsewhere for
# the same resolutions — reconcile the topic counts.
cnw_year_active20 <- cnw_ts20 %>%
  group_by(year) %>%
  summarise(active20 = sum(share > 0.001) / 22)

cnw_year_active50 <- cnw_ts50 %>%
  group_by(year) %>%
  summarise(active50 = sum(share > 0.001) / 162)

cnw_year_active250 <- cnw_ts250 %>%
  group_by(year) %>%
  summarise(active250 = sum(share > 0.001) / 250)


# Attach the yearly summaries to `thetimes` by year; unclass()[[1]] pulls the
# single column out of the tibble as a plain vector.
thetimes$cnw_gini20 <- unclass(cnw_year_gini20[match(thetimes$year, cnw_year_gini20$year), "gini20"])[[1]]
thetimes$cnw_gini50 <- unclass(cnw_year_gini50[match(thetimes$year, cnw_year_gini50$year), "gini50"])[[1]]
thetimes$cnw_gini250 <- unclass(cnw_year_gini250[match(thetimes$year, cnw_year_gini250$year), "gini250"])[[1]]

thetimes$cnw_active20 <- unclass(cnw_year_active20[match(thetimes$year, cnw_year_active20$year), "active20"])[[1]]
thetimes$cnw_active50 <- unclass(cnw_year_active50[match(thetimes$year, cnw_year_active50$year), "active50"])[[1]]
thetimes$cnw_active250 <- unclass(cnw_year_active250[match(thetimes$year, cnw_year_active250$year), "active250"])[[1]]



# Treat exact-zero CNW Ginis as missing (a zero here means "no data", not
# perfect equality), then invert to 1 - Gini as a diversity measure.
# which() drops NA positions, so pre-existing NAs pass through untouched —
# same outcome as the replace(x, x == 0, NA) formulation.
zero_as_na <- function(v) {
  v[which(v == 0)] <- NA
  v
}

thetimes$cnw_gini20r <- zero_as_na(thetimes$cnw_gini20)
thetimes$cnw_gini50r <- zero_as_na(thetimes$cnw_gini50)
thetimes$cnw_gini250r <- zero_as_na(thetimes$cnw_gini250)

thetimes$cnw_rgini20r <- 1 - thetimes$cnw_gini20r
thetimes$cnw_rgini50r <- 1 - thetimes$cnw_gini50r
thetimes$cnw_rgini250r <- 1 - thetimes$cnw_gini250r

### Build df.topic_div
# One long frame holding topic-diversity indicators (active share, 1 - Gini)
# per year, categorization, and corpus (noncrisis / crisis / wave).

df.topic_div <- data.frame(expand.grid(year = seq(1785, 2020, 1), indicator = c("active", "gini"), categorization = c("topic250", "topic50", "topic20"), corpus = c("noncrisis", "crisis", "wave"), value = NA))

### df.topic_div WAVE
# NOTE(review): all block assignments below rely on expand.grid's ordering
# (year varies fastest), so each logical subset selects exactly 236 rows in
# year order, matching the row order of `thetimes`; verify thetimes has one
# row per year 1785-2020.

## Gini

df.topic_div[(df.topic_div$indicator == "gini" & df.topic_div$categorization == "topic20" & df.topic_div$corpus == "wave"), "value"] <- 1 - thetimes$cnw_gini20

df.topic_div[(df.topic_div$indicator == "gini" & df.topic_div$categorization == "topic50" & df.topic_div$corpus == "wave"), "value"] <- 1 - thetimes$cnw_gini50

df.topic_div[(df.topic_div$indicator == "gini" & df.topic_div$categorization == "topic250" & df.topic_div$corpus == "wave"), "value"] <- 1 - thetimes$cnw_gini250

## Active

df.topic_div[(df.topic_div$indicator == "active" & df.topic_div$categorization == "topic20" & df.topic_div$corpus == "wave"), "value"] <- thetimes$cnw_active20

df.topic_div[(df.topic_div$indicator == "active" & df.topic_div$categorization == "topic50" & df.topic_div$corpus == "wave"), "value"] <- thetimes$cnw_active50

df.topic_div[(df.topic_div$indicator == "active" & df.topic_div$categorization == "topic250" & df.topic_div$corpus == "wave"), "value"] <- thetimes$cnw_active250

### df.topic_div CRISIS

## Gini

df.topic_div[(df.topic_div$indicator == "gini" & df.topic_div$categorization == "topic20" & df.topic_div$corpus == "crisis"), "value"] <- 1 - thetimes$gini

df.topic_div[(df.topic_div$indicator == "gini" & df.topic_div$categorization == "topic50" & df.topic_div$corpus == "crisis"), "value"] <- 1 - thetimes$gini50

df.topic_div[(df.topic_div$indicator == "gini" & df.topic_div$categorization == "topic250" & df.topic_div$corpus == "crisis"), "value"] <- 1 - thetimes$gini250

## Active

df.topic_div[(df.topic_div$indicator == "active" & df.topic_div$categorization == "topic20" & df.topic_div$corpus == "crisis"), "value"] <- thetimes$active20

df.topic_div[(df.topic_div$indicator == "active" & df.topic_div$categorization == "topic50" & df.topic_div$corpus == "crisis"), "value"] <- thetimes$active50

df.topic_div[(df.topic_div$indicator == "active" & df.topic_div$categorization == "topic250" & df.topic_div$corpus == "crisis"), "value"] <- thetimes$active250

### df.topic_div NONCRISIS
# NOTE(review): the noncrisis series (df_routine.*_L, built elsewhere) only
# runs to 2014, hence the year < 2015 restriction; the RHS must be exactly
# 230 values in year order — verify the lengths match.

## Gini

df.topic_div[(df.topic_div$year < 2015 & df.topic_div$indicator == "gini" & df.topic_div$categorization == "topic20" & df.topic_div$corpus == "noncrisis"), "value"] <- subset(df_routine.gini_L, name == "topic20")$value

df.topic_div[(df.topic_div$year < 2015 & df.topic_div$indicator == "gini" & df.topic_div$categorization == "topic50" & df.topic_div$corpus == "noncrisis"), "value"] <- subset(df_routine.gini_L, name == "topic50")$value

df.topic_div[(df.topic_div$year < 2015 & df.topic_div$indicator == "gini" & df.topic_div$categorization == "topic250" & df.topic_div$corpus == "noncrisis"), "value"] <- subset(df_routine.gini_L, name == "topic250")$value

## Active

df.topic_div[(df.topic_div$year < 2015 & df.topic_div$indicator == "active" & df.topic_div$categorization == "topic20" & df.topic_div$corpus == "noncrisis"), "value"] <- subset(df_routine.active_L, resolution == "21 topic areas")$active

df.topic_div[(df.topic_div$year < 2015 & df.topic_div$indicator == "active" & df.topic_div$categorization == "topic50" & df.topic_div$corpus == "noncrisis"), "value"] <- subset(df_routine.active_L, resolution == "163 topic complexes")$active

df.topic_div[(df.topic_div$year < 2015 & df.topic_div$indicator == "active" & df.topic_div$categorization == "topic250" & df.topic_div$corpus == "noncrisis"), "value"] <- subset(df_routine.active_L, resolution == "250 topics")$active


# Diversity indicators over time, faceted categorization x indicator, one
# series per corpus.
top_div_year <- ggplot(df.topic_div, aes(y = value, x = year, color = corpus, shape = corpus, fill = corpus)) +
  geom_point(size = 1) +
  geom_smooth(span = 0.15) +
  facet_grid(categorization ~ indicator) +
  theme_soft() +
  scale_fill_viridis_d(option = "inferno", begin = .1, end = .9, direction = -1) +
  scale_color_viridis_d(option = "inferno", begin = .1, end = .9, direction = -1)

ggsave(top_div_year, file = "top_div_year.svg", unit = "cm", width = 16, height = 8, dpi = 1200, scale = 1.5)

# Persist the assembled indicator frame for downstream sections.
save(df.topic_div, file = "df.topic_div.RData")


# Decade-level Gini of CNW volume per topic, from the nwd.to* aggregates.
# NOTE(review): lowercase gini() is used here while Gini() (DescTools) is used
# elsewhere — confirm which package supplies gini() and that both implement
# the same estimator.
decades <- data.frame(decade = seq(1780, 2020, 10), nw.gini20 = NA, nw.gini50 = NA, nw.gini250 = NA)
for (i in seq(1780, 2020, 10))
{
  decades[decades$decade == i, "nw.gini20"] <- gini(subset(nwd.to20, Group.1 == i)$value)
  decades[decades$decade == i, "nw.gini50"] <- gini(subset(nwd.to50, Group.1 == i)$value)
  decades[decades$decade == i, "nw.gini250"] <- gini(subset(nwd.to250, Group.1 == i)$value)
}

# Yearly version, written into `thetimes`.
# NOTE(review): this requires nw.to*$Group.1 to hold YEARS — confirm the
# grouping variable used when nw.to* were aggregated; with a decade grouping
# most iterations would subset zero rows.
thetimes$nw.gini20 <- NA
thetimes$nw.gini50 <- NA
thetimes$nw.gini250 <- NA

for (i in 1785:2020)
{
  thetimes[thetimes$year == i, "nw.gini20"] <- gini(subset(nw.to20, Group.1 == i)$value)
  thetimes[thetimes$year == i, "nw.gini50"] <- gini(subset(nw.to50, Group.1 == i)$value)
  thetimes[thetimes$year == i, "nw.gini250"] <- gini(subset(nw.to250, Group.1 == i)$value)
}

# Zeros mean "no data" here — recode to NA, then invert to 1 - Gini.
thetimes$nw.gini20r <- replace(thetimes$nw.gini20, thetimes$nw.gini20 == 0, NA)
thetimes$nw.gini50r <- replace(thetimes$nw.gini50, thetimes$nw.gini50 == 0, NA)
thetimes$nw.gini250r <- replace(thetimes$nw.gini250, thetimes$nw.gini250 == 0, NA)

thetimes$nw.rgini20r <- 1 - thetimes$nw.gini20r
thetimes$nw.rgini50r <- 1 - thetimes$nw.gini50r
thetimes$nw.rgini250r <- 1 - thetimes$nw.gini250r




# Long format of the per-decade CNW Gini coefficients.
thetimes3c <- pivot_longer(decades, cols = c("nw.gini20", "nw.gini50", "nw.gini250"))
thetimes3c$Decade <- factor(thetimes3c$decade, ordered = TRUE)

# Quick diagnostic plot of decade-level concentration.
# FIX(review): the original mapped x = year, but thetimes3c has no `year`
# column (it is decade-level), so aes() picked up the unrelated global `year`
# vector (length 236 vs 75 rows) and the plot failed; mapped to decade.
# NOTE(review): the filter drops name == "nw.gini20r", a value that never
# occurs here (names are nw.gini20/50/250), so it is a no-op — confirm whether
# "nw.gini20" was meant.
ggplot(subset(thetimes3c, !(name == "nw.gini20r")), aes(y = value, x = decade, color = name, fill = name, shape = name)) +
  geom_point() +
  geom_smooth()

# Publication figure: 1 - Gini ("diversity") of CNW volume per decade, series
# identified by hand-placed labels (positions hard-coded); the incomplete
# 1780s decade is excluded.
gini_time_cnw <- ggplot(subset(thetimes3c, decade > 1790), aes(y = 1 - value, x = decade, color = name, fill = name, shape = name)) +
  geom_point() +
  geom_smooth() +
  ggtitle("Crisis news wave coverage (17'338)") +
  annotate("label", fill = viridis(3, begin = 0.1, end = 0.8)[c(1, 3, 2)], color = "white", x = c(1925, 1875, 1850), y = c(0.3, 0.167, 0.025), label = c("topic areas(20)", "topic complexes(163)", "topics(250)")) +
  theme_bluewhite() +
  scale_color_viridis_d(begin = 0.1, end = 0.8) +
  scale_fill_viridis_d(begin = 0.1, end = 0.8) +
  xlab("Decade") +
  ylab("1-Gini coefficient") +
  theme(legend.position = "none")

ggsave(gini_time_cnw, file = "gini_time_cnw.svg", device = "svg", unit = "cm", width = 16, height = 12, dpi = 1200, scale = 1.25)


##################################################
#### Create a data frame of the number of CRISIS NEWS WAVES
####

count_cnw_year <- ggplot(df.cnw.count, aes(x = year, y = cnw)) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Count of crisis news waves")

count_cnw_year2 <- ggplot(df.cnw.count, aes(x = year, y = cnw, label = year)) +
  geom_point(aes(size = cnw), alpha = .2, shape = 17) +
  geom_text(aes(size = cnw)) +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Count of crisis news waves") +
  scale_y_sqrt() +
  theme(legend.position = "none")


share_cnw_year <- ggplot(thetimes, aes(x = year, y = 100 * cnw.share)) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Share of CNW coverage") +
  scale_y_sqrt()

share_cl_year <- ggplot(thetimes, aes(x = year, y = 100 * cl.share)) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Share of CL coverage") +
  scale_y_sqrt()

intensity_cnw <- ggplot(subset(wd250x, volume < 100), aes(y = intensity2, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red", formula = y ~ x) +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Intensity of crisis news waves (articles per day)") +
  xlab("Year") +
  scale_y_sqrt()


volume_cnw <- ggplot(subset(wd250x, volume < 100), aes(y = volume2, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red", formula = y ~ x) +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Volume of crisis news waves") +
  xlab("Year") +
  scale_y_sqrt()

# Duration of each crisis news wave against its year (all waves);
# loess (teal) and linear (red) trends, square-root y-axis.
duration_cnw <- ggplot(wd250x, aes(x = as.numeric(year), y = duration)) +
  geom_point() +
  geom_smooth(linetype = "solid", color = "darkslategray4", fill = "darkslategray4") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, by = 20)) +
  scale_y_sqrt() +
  labs(x = "Year", y = "Duration of crisis news waves")

# Export the six trend figures as SVG.
# `plot`, `filename` and `units` are spelled out: the original calls
# relied on partial argument matching (`file` -> `filename`,
# `unit` -> `units`), which is fragile and flagged by lintr.
ggsave(plot = count_cnw_year2, filename = "count_cnw_year.svg", dpi = 1200, width = 16, height = 8, units = "cm", scale = 1.5)
ggsave(plot = share_cl_year, filename = "share_cl_year.svg", dpi = 1200, width = 16, height = 8, units = "cm", scale = 1.5)
ggsave(plot = share_cnw_year, filename = "share_cnw_year.svg", dpi = 1200, width = 16, height = 8, units = "cm", scale = 1.5)
ggsave(plot = volume_cnw, filename = "volume_year.svg", dpi = 1200, width = 16, height = 8, units = "cm", scale = 1.5)
ggsave(plot = duration_cnw, filename = "duration_cnw.svg", dpi = 1200, width = 16, height = 8, units = "cm", scale = 1.5)
ggsave(plot = intensity_cnw, filename = "intensity_cnw.svg", dpi = 1200, width = 16, height = 8, units = "cm", scale = 1.5)

# Model-selection diagnostics for the number of STM topics K:
# exclusivity, semantic coherence, held-out likelihood and residuals
# against K, with the chosen K = 250 marked by a red vertical line.
search_for_K <- ggplot(subset(sk10t, variable %in% c("exclus", "semcoh", "heldout", "residual")), aes(x = K, y = value)) +
  geom_point() +
  geom_line() +
  geom_smooth(color = "darkslategray4") +
  geom_vline(xintercept = 250, color = "red") +
  facet_wrap(. ~ variable, scales = "free_y") +
  theme_bluewhite() +
  ylab("Value")

# `plot`/`filename`/`units` spelled out (the original relied on partial
# matching of `file`/`unit`).
ggsave(plot = search_for_K, filename = "search_for_K.svg", dpi = 1200, scale = 1.5, units = "cm", width = 16, height = 10)

# Linear time trends for the CL/CNW outcomes, with time recentred so
# that 1785 = 0. Several models drop the intercept (`+ 0`), i.e. they
# force the fitted line through zero at 1785 -- presumably deliberate
# for share-type outcomes, TODO confirm this was intended.
lm.cl_share.year <- (lm(100 * cl.share ~ I(year - 1785) + 0, data = thetimes))
lm.cnw_share.year <- (lm(100 * cnw.share ~ I(year - 1785) + 0, data = thetimes))
lm.cnw_count.year <- (lm(cnw ~ I(year - 1785), data = thetimes))
# Volume and intensity are shifted by the 30-day baseline before
# fitting (baseline30 is defined elsewhere in the file).
lm.cnw_volume.year <- (lm((volume + duration * baseline30) ~ I(as.numeric(year) - 1785) + 0, data = wd250x))
lm.cnw_duration.year <- (lm(duration ~ I(as.numeric(year) - 1785), data = wd250x))
lm.cnw_intensity.year <- (lm(I(intensity + baseline30) ~ I(as.numeric(year) - 1785) + 0, data = wd250x))

# 95% confidence intervals for each model's coefficients.
ci.cl_share.year <- confint(lm.cl_share.year)
ci.cnw_share.year <- confint(lm.cnw_share.year)
ci.cnw_count.year <- confint(lm.cnw_count.year)
ci.cnw_volume.year <- confint(lm.cnw_volume.year)
ci.cnw_duration.year <- confint(lm.cnw_duration.year)
ci.cnw_intensity.year <- confint(lm.cnw_intensity.year)

# Model predictions at the years held in `y25` (defined elsewhere in
# the file; presumably a 25-year grid -- TODO confirm). The formulas
# reference `year` directly, so a data.frame(year = y25) newdata works
# for all six models.
pred.cl_share.year <- predict(lm.cl_share.year, newdata = data.frame(year = y25))
pred.cnw_share.year <- predict(lm.cnw_share.year, newdata = data.frame(year = y25))
pred.cnw_count.year <- predict(lm.cnw_count.year, newdata = data.frame(year = y25))
pred.cnw_volume.year <- predict(lm.cnw_volume.year, newdata = data.frame(year = y25))
pred.cnw_duration.year <- predict(lm.cnw_duration.year, newdata = data.frame(year = y25))
pred.cnw_intensity.year <- predict(lm.cnw_intensity.year, newdata = data.frame(year = y25))

# Name each prediction vector by its year for readable printing.
names(pred.cl_share.year) <- y25
names(pred.cnw_share.year) <- y25
names(pred.cnw_count.year) <- y25
names(pred.cnw_volume.year) <- y25
names(pred.cnw_duration.year) <- y25
names(pred.cnw_intensity.year) <- y25

# Full topic spectrum: one rectangle per crisis news wave, positioned
# horizontally by start/end date (days since 1785, widened 2.5x for
# visibility) and vertically by topic row; rectangle height encodes
# sqrt(volume / 195). Topic labels are printed at both margins.
# Fix: removed the viridis colour/fill scales that sat before the
# manual scales -- ggplot2 keeps only the last scale per aesthetic, so
# they were dead code that only emitted "Scale ... already present"
# messages.
topic_spectrum_year <- ggplot(subset(wd250x, !is.na(topic20)), aes(x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), color = topic20_ord, fill = topic20_ord, ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 195)^(1 / 2) - 1)) +
  geom_rect() +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  scale_y_continuous(breaks = seq(0, 20, 5), minor_breaks = seq(0, 20, 1)) +
  # Paired qualitative palette recycled over the 20 topic rows.
  scale_color_manual("Topic", values = rep(c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"), times = 2)) +
  scale_fill_manual("Topic", values = rep(c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"), times = 2)) +
  ylab("Topics") +
  xlab("Year") +
  theme(axis.text.y = element_blank())

# Exploratory overlays: smoothed yearly CNW counts drawn on top of the
# topic spectrum (printed interactively, not saved). The rect
# aesthetics are nulled out so geom_smooth only sees x/y.
topic_spectrum_year + geom_smooth(data = thetimes, aes(x = year, y = cnw / 2, xmin = NULL, xmax = NULL, ymin = NULL, ymax = NULL), color = "black", fill = "black")


topic_spectrum_year + geom_smooth(data = thetimes, aes(x = year, y = 100 * cnw.count / max(cnw.count), xmin = NULL, xmax = NULL, ymin = NULL, ymax = NULL), color = "black", fill = "black")

# `plot`/`filename`/`units` spelled out (the original relied on partial
# matching of `file`/`unit`).
ggsave(plot = topic_spectrum_year, filename = "topic_spectrum_year.svg", dpi = 1200, units = "cm", width = 16, height = 8, scale = 1.75)


# Interactive (plotly) spectrum of epidemic ("EPI") crisis news waves,
# annotated with each wave's event id (right-aligned) and label
# (left-aligned).
# Fix: the chain previously added scale_x_continuous() twice; ggplot2
# keeps only the last scale per aesthetic, so the trailing duplicate
# was dead code that only emitted an "already present" message.
EPI_topic_spectrum_year <- ggplot(subset(wd250x, topic20 == "EPI"), aes(color = topic20, fill = topic20, x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 50000)^(1 / 2) - 1)) +
  geom_rect(color = "black", alpha = .3) +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(label = event_id, y = topic20_num + coord - 1), hjust = 1) +
  geom_text(aes(label = event_label, y = topic20_num + coord - 1), hjust = 0) +
  scale_y_continuous(breaks = seq(0, 20, 5), minor_breaks = seq(0, 20, 1)) +
  ylab("Topics") +
  xlab("Year") +
  theme(axis.text.y = element_blank()) +
  # NOTE(review): manual scales take the subset's color20 column as
  # values; assumes its length covers the levels -- TODO confirm.
  scale_color_manual(values = subset(wd250x, topic20 == "EPI")$color20) +
  scale_fill_manual(values = subset(wd250x, topic20 == "EPI")$color20)

ggplotly(EPI_topic_spectrum_year)

# Interactive (plotly) spectrum of political ("POL") crisis news waves,
# annotated with event ids and labels.
# Fix: removed the trailing duplicate scale_x_continuous() -- ggplot2
# keeps only the last scale per aesthetic, so it was dead code that
# only emitted an "already present" message.
POL_topic_spectrum_year <- ggplot(subset(wd250x, topic20 == "POL"), aes(color = topic20, fill = topic20, x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 50000)^(1 / 2) - 1)) +
  geom_rect(color = "black", alpha = .3) +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(label = event_id, y = topic20_num + coord - 1), hjust = 1) +
  geom_text(aes(label = event_label, y = topic20_num + coord - 1), hjust = 0) +
  scale_y_continuous(breaks = seq(0, 20, 5), minor_breaks = seq(0, 20, 1)) +
  ylab("Topics") +
  xlab("Year") +
  theme(axis.text.y = element_blank()) +
  scale_color_manual(values = subset(wd250x, topic20 == "POL")$color20) +
  scale_fill_manual(values = subset(wd250x, topic20 == "POL")$color20)

ggplotly(POL_topic_spectrum_year)

# Interactive (plotly) spectrum of transport ("TRA") crisis news waves,
# annotated with event ids and labels.
# Fix: removed the trailing duplicate scale_x_continuous() -- ggplot2
# keeps only the last scale per aesthetic, so it was dead code that
# only emitted an "already present" message.
TRA_topic_spectrum_year <- ggplot(subset(wd250x, topic20 == "TRA"), aes(color = topic20, fill = topic20, x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 50000)^(1 / 2) - 1)) +
  geom_rect(color = "black", alpha = .3) +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(label = event_id, y = topic20_num + coord - 1), hjust = 1) +
  geom_text(aes(label = event_label, y = topic20_num + coord - 1), hjust = 0) +
  scale_y_continuous(breaks = seq(0, 20, 5), minor_breaks = seq(0, 20, 1)) +
  ylab("Topics") +
  xlab("Year") +
  theme(axis.text.y = element_blank()) +
  scale_color_manual(values = subset(wd250x, topic20 == "TRA")$color20) +
  scale_fill_manual(values = subset(wd250x, topic20 == "TRA")$color20)

ggplotly(TRA_topic_spectrum_year)


# Interactive (plotly) spectrum of economic ("ECO") crisis news waves.
# Event ids are pushed below their bars via two leading newlines;
# event labels sit to the right of the bars.
ECO_topic_spectrum_year <- ggplot(subset(wd250x, topic20 == "ECO"), aes(color = topic20, fill = topic20, x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 50000)^(1 / 2) - 1)) +
  geom_rect(alpha = .3, color = "black") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, by = 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(label = paste0("\n\n", event_id), y = topic20_num + coord - 1), hjust = 0.5) +
  geom_text(aes(label = event_label, y = topic20_num + coord - 1), hjust = 0) +
  scale_y_continuous(breaks = seq(0, 20, by = 5), minor_breaks = seq(0, 20, by = 1)) +
  scale_color_manual(values = subset(wd250x, topic20 == "ECO")$color20) +
  scale_fill_manual(values = subset(wd250x, topic20 == "ECO")$color20) +
  labs(x = "Year", y = "Topics") +
  theme(axis.text.y = element_blank())

ggplotly(ECO_topic_spectrum_year)






# Simplified EPI spectrum (redefines EPI_topic_spectrum_year): event
# ids only, with default text justification, no event labels.
EPI_topic_spectrum_year <- ggplot(subset(wd250x, topic20 == "EPI"), aes(color = topic20, fill = topic20, x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 50000)^(1 / 2) - 1)) +
  geom_rect(alpha = .3, color = "black") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, by = 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(label = event_id, y = topic20_num + coord - 1)) +
  scale_y_continuous(breaks = seq(0, 20, by = 5), minor_breaks = seq(0, 20, by = 1)) +
  scale_color_manual(values = subset(wd250x, topic20 == "EPI")$color20) +
  scale_fill_manual(values = subset(wd250x, topic20 == "EPI")$color20) +
  labs(x = "Year", y = "Topics") +
  theme(axis.text.y = element_blank())

# Simplified ECO spectrum (redefines ECO_topic_spectrum_year): event
# ids only, with default text justification, no event labels.
ECO_topic_spectrum_year <- ggplot(subset(wd250x, topic20 == "ECO"), aes(color = topic20, fill = topic20, x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 50000)^(1 / 2) - 1)) +
  geom_rect(alpha = .3, color = "black") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, by = 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(label = event_id, y = topic20_num + coord - 1)) +
  scale_y_continuous(breaks = seq(0, 20, by = 5), minor_breaks = seq(0, 20, by = 1)) +
  scale_color_manual(values = subset(wd250x, topic20 == "ECO")$color20) +
  scale_fill_manual(values = subset(wd250x, topic20 == "ECO")$color20) +
  labs(x = "Year", y = "Topics") +
  theme(axis.text.y = element_blank())



# ECO spectrum with the shared paired palette (redefines
# ECO_topic_spectrum_year).
# Fix: removed the viridis colour/fill scales that were immediately
# replaced by the manual scales below -- ggplot2 keeps only the last
# scale per aesthetic, so they were dead code that only emitted
# "Scale ... already present" messages.
ECO_topic_spectrum_year <- ggplot(subset(wd250x, topic20 == "ECO"), aes(x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), color = topic20_ord, fill = topic20_ord, ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 50000)^(1 / 2) - 1)) +
  geom_rect(color = "black", alpha = .3) +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  scale_y_continuous(breaks = seq(0, 20, 5), minor_breaks = seq(0, 20, 1)) +
  scale_color_manual("Topic", values = rep(c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"), times = 2)) +
  scale_fill_manual("Topic", values = rep(c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"), times = 2)) +
  ylab("Topics") +
  xlab("Year") +
  theme(axis.text.y = element_blank())

# POL spectrum with the shared paired palette (redefines
# POL_topic_spectrum_year).
# Fix: removed the dead viridis colour/fill scales that the manual
# scales below replaced (ggplot2 keeps only the last scale per
# aesthetic).
POL_topic_spectrum_year <- ggplot(subset(wd250x, topic20 == "POL"), aes(x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), color = topic20_ord, fill = topic20_ord, ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 50000)^(1 / 2) - 1)) +
  geom_rect(color = "black", alpha = .3) +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  scale_y_continuous(breaks = seq(0, 20, 5), minor_breaks = seq(0, 20, 1)) +
  scale_color_manual("Topic", values = rep(c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"), times = 2)) +
  scale_fill_manual("Topic", values = rep(c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"), times = 2)) +
  ylab("Topics") +
  xlab("Year") +
  theme(axis.text.y = element_blank())

# DIS (disaster) spectrum with the shared paired palette.
# Fix: removed the dead viridis colour/fill scales that the manual
# scales below replaced (ggplot2 keeps only the last scale per
# aesthetic).
DIS_topic_spectrum_year <- ggplot(subset(wd250x, topic20 == "DIS"), aes(x = as.numeric(year), xmin = as.numeric(1785 + (start / 365)), xmax = as.numeric(1785 + 2.5 * ((end - start) / 365) + start / 365), color = topic20_ord, fill = topic20_ord, ymin = topic20_num + coord - 1, ymax = topic20_num + coord + (volume / 50000)^(1 / 2) - 1)) +
  geom_rect(color = "black", alpha = .3) +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  geom_text(aes(x = 2025, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  geom_text(aes(x = 1790, y = topic20_num - 0.5, label = topic20_ord), hjust = 0) +
  scale_y_continuous(breaks = seq(0, 20, 5), minor_breaks = seq(0, 20, 1)) +
  scale_color_manual("Topic", values = rep(c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"), times = 2)) +
  scale_fill_manual("Topic", values = rep(c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"), times = 2)) +
  ylab("Topics") +
  xlab("Year") +
  theme(axis.text.y = element_blank())







# Yearly counts of ECO vs non-ECO crisis news waves.
# table() cross-tabulates year by the ECO indicator; in the resulting
# data frame Var1 = year (factor) and Var2 = TRUE/FALSE for
# topic20 == "ECO".
eco.cnw.count <- data.frame(table(wd250x$year, wd250x$topic20 == "ECO"))
# Long skeleton: 236 years (1785:2020) x {ECO, non-ECO}. The hard-coded
# 236 / 1:236 / 237:472 index ranges below must stay in sync with that
# year span.
df.eco.cnw.count <- data.frame(year = c(1785:2020, 1785:2020), topic = rep(c("ECO", "non-ECO"), each = 236), cnw.count = 0)

# Fill the ECO rows by matching each skeleton year against the table's
# year levels. Years absent from wd250x entirely have no table row and
# end up NA (match() miss) rather than 0 -- TODO confirm intended.
df.eco.cnw.count$cnw.count[1:236] <- subset(eco.cnw.count, Var2 == TRUE)[match(subset(df.eco.cnw.count, topic == "ECO")$year, subset(eco.cnw.count, Var2 == TRUE)$Var1), "Freq"]

# Same for the non-ECO rows.
df.eco.cnw.count$cnw.count[237:472] <- subset(eco.cnw.count, Var2 == FALSE)[match(subset(df.eco.cnw.count, topic == "non-ECO")$year, subset(eco.cnw.count, Var2 == FALSE)$Var1), "Freq"]

# Exploratory plot of both series with loess trends.
ggplot(df.eco.cnw.count, aes(y = cnw.count, x = year, color = topic, fill = topic, shape = topic)) +
  geom_point() +
  geom_smooth()




##################################################
#### Volume of CRISIS NEWS WAVES
####

# Wave volume over time (waves with volume < 100), loess + linear fit.
ggplot(subset(wd250x, volume < 100), aes(y = volume, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red", formula = y ~ x) +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Volume of crisis news waves")

# Linear time trend of volume on the *full* wave data (no volume cap).
summary(lm(volume ~ as.numeric(year), data = wd250x))
### B=.044 (.004); t=12.34; p<.001

# Same relationship faceted by topic.
ggplot(subset(wd250x, volume < 100), aes(y = volume, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  facet_wrap(. ~ topic20) +
  ylim(0, 10)

# Faceted by the binary ECO / non-ECO distinction.
ggplot(subset(wd250x, volume < 100), aes(y = volume, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  facet_wrap(. ~ topic20 == "ECO") +
  ylim(0, 10)

# Both ECO groups in one panel, distinguished by colour/shape.
ggplot((wd250x), aes(y = volume, x = as.numeric(year), shape = (topic20 == "ECO"), color = (topic20 == "ECO"))) +
  geom_point() +
  geom_smooth() +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylim(0, 10)


##################################################
#### Duration of CRISIS NEWS WAVES
####

# Wave duration over time, loess + linear fit.
ggplot((wd250x), aes(y = duration, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Duration of crisis news waves")

# Linear time trend of duration on the full wave data.
summary(lm(duration ~ as.numeric(year), data = wd250x))
### B=.840 (.042); t=19.83; p<.001

# Same relationship faceted by topic.
ggplot((wd250x), aes(y = duration, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  facet_wrap(. ~ topic20) +
  ylim(0, 500)

# Faceted by the binary ECO / non-ECO distinction.
ggplot((wd250x), aes(y = duration, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  facet_wrap(. ~ topic20 == "ECO") +
  ylim(0, 500)

# Both ECO groups in one panel, distinguished by colour/shape.
ggplot((wd250x), aes(y = duration, x = as.numeric(year), shape = (topic20 == "ECO"), color = (topic20 == "ECO"))) +
  geom_point() +
  geom_smooth() +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylim(0, 500)

##################################################
#### Intensity of CRISIS NEWS WAVES
####

# Wave intensity over time (waves with intensity < 0.4), loess +
# linear fit.
ggplot(subset(wd250x, intensity < 0.4), aes(y = intensity, x = as.numeric(year))) +
  geom_point() +
  geom_smooth(color = "darkslategray4", fill = "darkslategray4", linetype = "solid") +
  geom_smooth(method = "lm", color = "red", fill = "red") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 20))

# Linear time trend of intensity on the full wave data.
summary(lm(intensity ~ as.numeric(year), data = wd250x))
### B=.00014487 (.00001336); t=10.846; p<.001




## News volume
# Context data for the coverage-volume figures below: article counts,
# editor tenures, ownership eras, key historical/technological events,
# cover prices and circulation figures for The Times.

# Articles published per year, taken from the main `thetimes` table.
newsvolume <- data.frame(year = thetimes$year, articles = thetimes$articles)

# Editor tenures (start/end years). Dawson appears twice (two separate
# tenures); Freeman's 1919-1919 row spans less than a calendar year --
# presumably an interim editorship, TODO confirm.
editors <- data.frame(
  start = c(1785, 1803, 1812, 1817, 1841, 1877, 1884, 1912, 1919, 1919, 1923, 1941, 1948, 1952, 1967, 1981, 1982, 1985, 1990, 1992, 2002, 2007, 2013),
  end = c(1803, 1812, 1817, 1841, 1877, 1884, 1912, 1919, 1919, 1923, 1941, 1948, 1952, 1967, 1981, 1982, 1985, 1990, 1992, 2002, 2007, 2013, 2020),
  editor = c("Walter I", "Walter II", "Stoddard", "Barnes", "Delane", "Chenery", "Buckle", "Dawson", "Freeman", "Steed", "Dawson", "Barrington-Ward", "Casey", "Haley", "Rees-Mogg", "Evans", "Douglas-Home", "Wilson", "Jenkins", "Stothard", "Thomson", "Harding", "Witherow")
)

# Ownership eras (start/end years).
owners <- data.frame(
  start = c(1785, 1803, 1847, 1894, 1908, 1922, 1959, 1966, 1976, 1981),
  end = c(1803, 1847, 1894, 1908, 1922, 1959, 1966, 1976, 1981, 2020),
  owner = c("Walter I", "Walter II", "Walter III", "Walter IV", "Harmsworth", "Astor I", "Astor II", "Thomson I", "Thomson II", "Murdoch")
)

# Key events affecting the paper (wars, scandals, printing technology,
# industrial action); rows are not chronologically ordered.
events <- data.frame(
  start = c(1803, 1914, 1939, 1935, 1887, 1920, 1814, 1844, 1838, 1848, 1860, 1978),
  end = c(1815, 1918, 1945, 1938, 1888, 1921, 1815, 1845, 1839, 1870, 1866, 1979),
  event = c("Napoleonic Wars", "World War I", "World War II", "Appeasement", "Piggott forgeries", "Zion hoax", "Hi-speed steam press", "Rotary press", "London-Birmingham Postal Railway", "Telegraph network established", "'Walter Press'", "Strike")
)


# Cover prices in pounds: pre-decimal entries are old pence / 240,
# post-decimalisation entries are new pence / 100.
prices <- data.frame(
  year = c(1788, 1797, 1805, 1814, 1824, 1838, 1847, 1857, 1865, 1873, 1882, 1890, 1907, 1921, 1931, 1939, 1950, 1973, 1980, 1990, 2000, 2007, 2014, 2020),
  price = c(3 / 240, 6 / 240, 6 / 240, 6.5 / 240, 7 / 240, 5 / 240, 5 / 240, 4 / 240, 3 / 240, 3 / 240, 3 / 240, 3 / 240, 3 / 240, 5 / 240, 4 / 240, 2.5 / 240, 3 / 240, 3 / 100, 20 / 100, 35 / 100, 35 / 100, 65 / 100, 120 / 100, 220 / 100)
)

# NOTE(review): interpolated GDP deflator -- the first 11 and last 2
# positions are padded with hard-coded values (1.41 and 106.00); the
# 12:230 index range must match the row layout of `spend` (defined
# elsewhere in the file) -- TODO confirm.
spend$GDP_deflator_i <- as.numeric(c(rep(1.41, times = 11), spend$GDP.Deflator[12:230], rep(106.00, times = 2)))

# Inflation-adjusted cover price, deflated via year-matched deflator.
prices$realprice <- 100 * prices$price / spend[match(prices$year, spend$Year), "GDP_deflator_i"]

# Circulation figures at selected years.
circulation <- data.frame(
  year = c(1815, 1852, 1910, 1921, 1930, 1939, 1947, 1956, 1966, 1976, 1980, 1992, 2000, 2005, 2010, 2015, 2019),
  circulation = c(5000, 42384, 45000, 113000, 187000, 204000, 268000, 220716, 282000, 310000, 297000, 386258, 726349, 686327, 508250, 396621, 417298)
)

# Per-year STM topic probability mass split by whether the topic is
# interpretable (area assigned) or not (area == NA).
senseless.topics <- aggregate(long.STM$pr, by = list(long.STM$year, is.na(long.STM$area)), FUN = "sum")
# NOTE(review): `rep(..., times = 2)` assumes the aggregate output is
# ordered year-fastest within the two Group.2 levels, so the yearly
# totals line up with both halves -- TODO confirm.
senseless.topics$total <- rep(tapply(senseless.topics[, "x"], senseless.topics$Group.1, "sum"), times = 2)
senseless.topics$share <- senseless.topics$x / senseless.topics$total

# Stacked-area chart of interpretable vs uninterpretable topic shares
# per year.
gg.senseless.topics <- ggplot(senseless.topics, aes(y = 100 * share, x = as.numeric(Group.1), fill = Group.2)) +
  geom_area(position = "stack") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  scale_color_viridis_d() +
  scale_fill_viridis_d(begin = .2, end = .8, labels = c("Interpretable", "Uninterpretable")) +
  xlab("Year") +
  ylab("Share of non-interpretable issues") +
  guides(fill = guide_legend("Interpretability\nof topic")) +
  theme(legend.position = c(0.7, 0.7))

# Exploratory: uninterpretable share only, with a loess trend.
ggplot(subset(senseless.topics, Group.2 == TRUE), aes(y = 100 * share, x = as.numeric(Group.1))) +
  geom_point() +
  geom_smooth() +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10))

# `plot`/`filename`/`units` spelled out (the original relied on partial
# matching of `file`/`unit`).
ggsave(plot = gg.senseless.topics, filename = "senseless.svg", units = "cm", width = 16, height = 8, scale = 1.25)


# Detection threshold for crisis events over time. `threshold2` is
# assumed to have been added to `newsvolume` elsewhere -- TODO confirm.
# `fifty` (vertical guide positions) is defined elsewhere in the file.
gg.threshold <- ggplot(newsvolume, aes(y = threshold2, x = year)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_point(color = "#7d3c98") +
  geom_line(color = "#7d3c98") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  scale_color_viridis_d() +
  scale_fill_viridis_d(begin = .2, end = .8) +
  xlab("Year") +
  # Fixed a typo in the axis label ("Thresdhold" -> "Threshold").
  ylab("Threshold of volume for\ndetection of crisis events") +
  guides(fill = guide_legend("Minimum wave volume"))


# `plot`/`filename`/`units` spelled out (the original relied on partial
# matching of `file`/`unit`).
ggsave(plot = gg.threshold, filename = "threshold.svg", units = "cm", width = 16, height = 8, scale = 1.25)



# First draft of the coverage-volume figure: article counts, scaled
# circulation (/7) and scaled real price (x50000) on one axis, with
# editor/owner/event bands drawn in-panel. Superseded by the cleaner
# version defined below.
coverage.volume.trajectory <- ggplot() +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "lightblue", size = 1) +
  geom_point(data = newsvolume, aes(y = articles, x = year), color = "black") +
  geom_smooth(data = newsvolume, aes(y = articles, x = year), color = "dodgerblue") +
  geom_point(data = circulation, aes(y = circulation / 7, x = year), shape = 15, size = 5, color = "#888888") +
  geom_point(data = prices, aes(y = realprice * 50000, x = year), shape = 16, size = 5, color = "hotpink") +
  geom_text(data = circulation, aes(y = circulation / 7, x = year + 2, label = circulation), color = "#888888", hjust = 0) +
  geom_rect(data = editors, aes(ymin = 1000, ymax = 3000, xmin = start, xmax = end, fill = editor), color = "white") +
  # Label y-positions cycle over four levels to reduce overlap; the
  # hand-built vector length must equal nrow(editors) (23 here).
  geom_text(data = editors, aes(y = c(rep(c(-1000, -7000, -4000, -10000), times = 5), -7000, -10000, -4000), x = start, label = editor), hjust = 0) +
  geom_rect(data = owners, aes(ymin = 140000, ymax = 142000, xmin = start, xmax = end, fill = owner), color = "white") +
  # Same scheme for the 10 owner labels.
  geom_text(data = owners, aes(y = c(rep(c(138000, 132000, 135000, 129000), times = 2), 138000, 132000), x = start, label = owner), hjust = 0) +
  geom_rect(data = events, aes(ymin = 120000, ymax = 125000, xmin = start, xmax = end, fill = event), color = "white") +
  # And for the 12 event labels (positions hand-tuned per row).
  geom_text(data = events, aes(y = c(118000, 118000, 118000, 115000, 115000, 115000, 112000, 118000, 109000, 106000, 112000, 112000), x = start, label = event), hjust = 0) +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  theme_soft() +
  theme(legend.position = "none")


coverage.volume.trajectory

# Final coverage-volume figure (redefines coverage.volume.trajectory):
# article counts (points + loess), circulation (grey squares, /7) and
# inflation-adjusted price (pink dots, x50000) on a shared axis, with
# bold in-panel series labels instead of a legend.
coverage.volume.trajectory <- ggplot() +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "lightblue", size = 1) +
  geom_point(data = newsvolume, aes(y = articles, x = year), color = "black") +
  geom_smooth(data = newsvolume, aes(y = articles, x = year), color = "dodgerblue") +
  geom_point(data = circulation, aes(y = circulation / 7, x = year), shape = 15, size = 4, color = "#888888") +
  geom_point(data = prices, aes(y = realprice * 50000, x = year), shape = 16, size = 4, color = "hotpink") +
  geom_text(data = prices, aes(y = realprice * 50000, x = year, label = round(realprice, 2)), size = 4, color = "hotpink", nudge_y = 4000) +
  geom_text(data = circulation, aes(y = circulation / 7, x = year + 2, label = circulation), color = "#888888", hjust = 0) +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  theme_bluewhite() +
  theme(legend.position = "none") +
  annotate("text", size = 5, x = c(1850, 1930, 1825), y = c(25000, 0, 120000), color = c("black", "darkgrey", "hotpink"), label = c("total stories published", "total circulation", "price (inflation adjusted)"), fontface = "bold")
coverage.volume.trajectory


# `plot`/`filename`/`units` spelled out (the original relied on partial
# matching of `file`/`unit`).
ggsave(plot = coverage.volume.trajectory, filename = "cov_vol_traj.svg", units = "cm", width = 16, height = 12, scale = 1.5)


# Timeline backdrop: editor tenures, ownership eras and key events as
# labelled rectangles; combined below with the coverage figure.
# Fix: removed a trailing `annotate("text", fontface = "bold",
# x = c(3000))` that supplied neither `y` nor `label` -- rendering the
# plot (the `backgrounds` line below) failed with geom_text's
# missing-aesthetics error.
backgrounds <- ggplot() +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "lightblue", size = 1) +
  geom_rect(data = editors, aes(ymin = 1000, ymax = 3000, xmin = start, xmax = end, fill = editor), color = "white") +
  # Label y-positions cycle over four levels to reduce overlap; vector
  # length must equal nrow(editors) (23).
  geom_text(data = editors, aes(y = c(rep(c(-1000, -7000, -4000, -10000), times = 5), -7000, -10000, -4000), x = start, label = editor), hjust = 0) +
  geom_rect(data = owners, aes(ymin = 40000, ymax = 42000, xmin = start, xmax = end, fill = owner), color = "white") +
  geom_text(data = owners, aes(y = c(rep(c(38000, 32000, 35000, 29000), times = 2), 38000, 32000), x = start, label = owner), hjust = 0) +
  geom_rect(data = events, aes(ymin = 20000, ymax = 25000, xmin = start, xmax = end, fill = event), color = "white") +
  geom_text(data = events, aes(y = c(18000, 18000, 18000, 15000, 15000, 15000, 12000, 18000, 9000, 6000, 12000, 12000), x = start, label = event), hjust = 0) +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  theme_soft() +
  theme(legend.position = "none", axis.text.y = element_text(color = "lightskyblue1")) +
  ylab("Events")
backgrounds

# Stack the coverage figure above the timeline backdrop (2:1 heights).
cov.vol.traj <- ggarrange(coverage.volume.trajectory, backgrounds, heights = c(2, 1))

# `plot`/`filename`/`units` spelled out (the original relied on partial
# matching of `file`/`unit`).
ggsave(plot = cov.vol.traj, filename = "cov_vol_traj.svg", units = "cm", width = 16, height = 8, scale = 3)

# Number of distinct values in `x` (an NA counts as one distinct value,
# per unique()'s semantics).
uniq_count <- function(x) length(unique(x))

# Yearly count of articles carrying crisis labelling (CL).
cl <- data.frame(table(wide.STM$year))

# `Var1` is a factor of years, hence the as.numeric(as.character())
# round-trip; `fifty` (guide positions) is defined elsewhere.
cl_time <- ggplot(cl, aes(x = as.numeric(as.character(Var1)), y = Freq)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.5) +
  geom_point() +
  geom_smooth(span = 0.5, color = "steelblue") +
  theme_bluewhite() +
  xlab("Year") +
  ylab("Count of news articles with CL") +
  scale_x_continuous(breaks = seq(1780, 2020, 10))

# `plot`/`filename`/`units` spelled out (the original relied on partial
# matching of `file`/`unit`).
ggsave(plot = cl_time, filename = "cl_time.svg", device = "svg", units = "cm", width = 16, height = 8, dpi = 1200, scale = 1.25)


# Crisis-event counts per year/decade, plus counts of *distinct*
# topics and areas covered (via uniq_count), from the volume-filtered
# wave data.
eventcount <- aggregate(wavedata.vol50$TOPIC, by = list(wavedata.vol50$year), FUN = length)
unique_eventcount <- setNames(
  aggregate(wavedata.vol50$TOPIC, by = list(wavedata.vol50$year), FUN = uniq_count),
  c("year", "UniqueTopicsCovered")
)

eventcount_decade <- aggregate(wavedata.vol50$TOPIC, by = list(wavedata.vol50$decade), FUN = length)
unique_eventcount_decade <- setNames(
  aggregate(wavedata.vol50$TOPIC, by = list(wavedata.vol50$decade), FUN = uniq_count),
  c("decade", "UniqueTopicsCovered")
)

eventcount_decade_area <- aggregate(wavedata.vol50$AREA, by = list(wavedata.vol50$decade), FUN = length)
unique_eventcount_decade_area <- setNames(
  aggregate(wavedata.vol50$AREA, by = list(wavedata.vol50$decade), FUN = uniq_count),
  c("decade", "UniqueAreasCovered")
)

eventcount_decade_area2 <- aggregate(wavedata.vol50$AREA2, by = list(wavedata.vol50$decade), FUN = length)
unique_eventcount_decade_area2 <- setNames(
  aggregate(wavedata.vol50$AREA2, by = list(wavedata.vol50$decade), FUN = uniq_count),
  c("decade", "UniqueAreasCovered")
)




# Collapse duplicate area labels in the per-decade CL data:
# "Epidemics" is folded into "Epidemic" and "Location" into
# "Geopolitical", then the now-redundant rows are dropped. The
# row-aligned additions below assume both subsets sort identically by
# decade -- TODO confirm.
decade.STM_area2b <- subset(decade.STM_area2, !area == "Epidemics" & !area == "Location")
decade.STM_area2b[decade.STM_area2b$area == "Epidemic", "articles"] <- decade.STM_area2[decade.STM_area2$area == "Epidemic", "articles"] + decade.STM_area2[decade.STM_area2$area == "Epidemics", "articles"]

decade.STM_area2b[decade.STM_area2b$area == "Geopolitical", "articles"] <- decade.STM_area2[decade.STM_area2$area == "Geopolitical", "articles"] + decade.STM_area2[decade.STM_area2$area == "Location", "articles"]


# NOTE(review): tapply() returns one total per decade, which is then
# recycled into a per-row column; this is only correct if rows are
# ordered decade-fastest and the lengths divide evenly -- TODO confirm.
decade.STM_area2b$total <- tapply(decade.STM_area2b$articles, decade.STM_area2b$decade, sum, na.rm = TRUE)
decade.STM_area2b$share <- decade.STM_area2b$articles / decade.STM_area2b$total

# Yearly CL counts and shares by area.
year.STM <- aggregate(long.STM$pr, by = list(long.STM$year, long.STM$area2), FUN = "sum", na.rm = TRUE)
names(year.STM) <- c("year", "area2", "count")
# Total per year = sum of CL counts across areas.
# Fix: a preceding assignment that filled `total` from
# newsvolume$articles was dead code -- it was immediately overwritten
# by this line -- and has been removed.
# NOTE(review): tapply() yields one total per year, recycled into a
# per-row column; assumes row order is year-fastest within area --
# TODO confirm.
year.STM$total <- tapply(year.STM$count, year.STM$year, "sum", na.rm = TRUE)
year.STM$share <- year.STM$count / year.STM$total

# Collapse the duplicate "Epidemics"/"Location" labels into
# "Epidemic"/"Geopolitical", as for the decade-level data. The
# row-aligned additions assume both subsets sort identically by year.
year.STM2 <- subset(year.STM, !area2 == "Epidemics" & !area2 == "Location")

year.STM2[year.STM2$area2 == "Epidemic", "share"] <- year.STM[year.STM$area2 == "Epidemic", "share"] + year.STM[year.STM$area2 == "Epidemics", "share"]
year.STM2[year.STM2$area2 == "Epidemic", "count"] <- year.STM[year.STM$area2 == "Epidemic", "count"] + year.STM[year.STM$area2 == "Epidemics", "count"]

year.STM2[year.STM2$area2 == "Geopolitical", "share"] <- year.STM[year.STM$area2 == "Geopolitical", "share"] + year.STM[year.STM$area2 == "Location", "share"]
year.STM2[year.STM2$area2 == "Geopolitical", "count"] <- year.STM[year.STM$area2 == "Geopolitical", "count"] + year.STM[year.STM$area2 == "Location", "count"]

# CL share over time, one facet per area (free y scales), with loess
# trends; `fifty` marks the vertical guide positions.
share_CL_traj <- ggplot(year.STM2, aes(y = 100 * share, x = as.numeric(year), color = area2, fill = area2)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_hline(yintercept = 0, color = "red") +
  geom_smooth() +
  geom_point() +
  facet_wrap(~area2, scales = "free_y", ncol = 3) +
  theme_bluewhite() +
  scale_color_viridis_d() +
  scale_fill_viridis_d() +
  scale_x_continuous(breaks = seq(1780, 2020, by = 20)) +
  labs(x = "Year", y = "Share of crisis labelling") +
  theme(legend.position = "none")

# Stacked-area view of CL share by area per decade (first draft with a
# bottom legend; a labelled version redefines this object further down
# in the file).
share_CL_traj_share <- ggplot(decade.STM_area2b, aes(y = 100 * share, x = as.numeric(decade), fill = area)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_area(color = "white") +
  theme_bluewhite() +
  scale_color_viridis_d() +
  scale_fill_viridis_d() +
  scale_x_continuous(breaks = seq(1780, 2020, by = 20)) +
  labs(x = "Year", y = "Share of crisis labelling") +
  theme(legend.position = "bottom")

# Absolute article counts with CL, stacked by area per decade.
share_CL_traj_abs <- ggplot(decade.STM_area2b, aes(y = articles, x = as.numeric(decade), fill = area)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_area() +
  theme_bluewhite() +
  scale_color_viridis_d() +
  scale_fill_viridis_d() +
  scale_x_continuous(breaks = seq(1780, 2020, by = 20)) +
  labs(x = "Year", y = "Number of articles with crisis labelling") +
  theme(legend.position = "bottom")

# Final version of the stacked-area share plot: legend suppressed, the 20
# area names placed by hand inside the plot instead.
share_CL_traj_share <- ggplot(decade.STM_area2b, aes(y = 100 * share, x = as.numeric(decade), fill = area)) +
  geom_area(color = "white", show.legend = FALSE) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  theme_bluewhite() +
  scale_color_viridis_d() +
  scale_fill_viridis_d() +
  theme(legend.position = "bottom") +
  scale_x_continuous(breaks = seq(1780, 2020, 20)) +
  ylab("Share of crisis labelling") +
  xlab("Year") +
  # Hand-tuned label positions: x, y and label are parallel 20-element
  # vectors. rep(c("white", "black"), each = 10) makes the first ten labels
  # white and the last ten black (NOT alternating).
  annotate("text",
    x =
      c(1900, 1980, 2008, 1973, 1850, 2000, 1800, 1900, 1830, 2010, 1820, 2000, 2000, 1805, 2020, 2000, 1870, 2015, 1860, 2015), y =
      c(95, 85, 65, 63, 75, 57, 75, 50, 30, 22, 17, 17, 12, 8, 8.5, 5.5, 6, 3, 2.5, 1), label =
      c("Disaster", "Economic", "Education", "Energy", "Epidemic", "Family", "Functional", "Geopolitical", "Government", "Health", "Justice", "Labor", "Leisure", "Military", "Public", "Science", "Society", "Technology", "Transport", "Welfare"), color = c(rep(c("white", "black"), each = 10))
  )


# Write the three trajectory figures to SVG.
ggsave(share_CL_traj, file = "share_CL_traj.svg", units = "cm", width = 16, height = 20, dpi = 1200, scale = 2)
ggsave(share_CL_traj_share, file = "share_CL_traj_share.svg", units = "cm", width = 16, height = 10, dpi = 1200, scale = 1.75)
ggsave(share_CL_traj_abs, file = "share_CL_traj_abs.svg", units = "cm", width = 16, height = 12, dpi = 1200, scale = 2)


# Distinct topics covered by crisis events per decade: points with a linear
# (green) and a loess (pink, dashed) trend; the red line marks the ceiling
# of 106 topics.
gg_topicspectrum.event.decades <- ggplot(
  unique_eventcount_decade,
  aes(x = decade, y = UniqueTopicsCovered)
)
gg_topicspectrum.event.decades <- gg_topicspectrum.event.decades +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "lightblue", size = 1) +
  geom_point(size = 2.5) +
  geom_smooth(method = "lm", fill = "#44aa66", color = "#44aa66") +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77", linetype = "longdash") +
  geom_hline(yintercept = 106, color = "red") +
  theme_bluewhite() +
  ylim(0, 106) +
  labs(x = "Decade", y = "Topics covered by crisis events in the decade") +
  scale_x_continuous(breaks = seq(1780, 2020, 10))

# Distinct topic areas covered by crisis events per decade; red line marks
# the ceiling of 22 areas.
gg_areaspectrum.event.decades <- ggplot(
  unique_eventcount_decade_area,
  aes(x = decade, y = UniqueAreasCovered)
)
gg_areaspectrum.event.decades <- gg_areaspectrum.event.decades +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "lightblue", size = 1) +
  geom_point(size = 2.5) +
  geom_smooth(method = "lm", fill = "#44aa66", color = "#44aa66") +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77", linetype = "longdash") +
  geom_hline(yintercept = 22, color = "red") +
  theme_bluewhite() +
  ylim(0, 22) +
  labs(x = "Decade", y = "Areas covered by crisis events in the decade") +
  scale_x_continuous(breaks = seq(1780, 2020, 10))

# Same plot for the finer-grained area classification (ceiling: 48 areas).
gg_areaspectrum2.event.decades <- ggplot(unique_eventcount_decade_area2, aes(x = decade, y = UniqueAreasCovered)) +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "lightblue", size = 1) +
  geom_point(size = 2.5) +
  geom_smooth(method = "lm", fill = "#44aa66", color = "#44aa66") +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77", linetype = "longdash") +
  geom_hline(yintercept = 48, color = "red") +
  theme_bluewhite() +
  ylim(0, 48) +
  ylab("Areas covered by crisis events in the decade") +
  xlab("Decade") +
  scale_x_continuous(breaks = seq(1780, 2020, 10))


# Write the three spectrum figures to SVG.
ggsave(gg_topicspectrum.event.decades, file = "topicspectrum_event_decade.svg", units = "cm", width = 16, height = 8, dpi = 1200, scale = 1.5)
ggsave(gg_areaspectrum.event.decades, file = "areaspectrum_event_decade.svg", units = "cm", width = 16, height = 8, dpi = 1200, scale = 1.5)
ggsave(gg_areaspectrum2.event.decades, file = "area2spectrum_event_decade.svg", units = "cm", width = 16, height = 8, dpi = 1200, scale = 1.5)


# Yearly volume of articles attached to crisis news waves (CNWs).
crisisarticles <- aggregate(wavedata.vol50$volume, by = list(wavedata.vol50$year), FUN = sum)

# Tidy up aggregate()'s default column names (Group.1 / x).
crisisarticles$year <- as.numeric(crisisarticles$Group.1)
crisisarticles$volume <- crisisarticles$x
# Aliases of the volume column; only CNW is consumed below.
crisisarticles$CNW <- crisisarticles$volume
crisisarticles$CL <- crisisarticles$volume
crisisarticles$Coverage <- crisisarticles$volume

# One row per year 1788-2020. Years missing from the lookup tables yield NA
# via match().
crisis.esc.stages <- data.frame(year = seq(1788, 2020, 1))
crisis.esc.stages$CNW <- crisisarticles$CNW[match(crisis.esc.stages$year, crisisarticles$year)]
crisis.esc.stages$CL <- newsvolume$crisis[match(crisis.esc.stages$year, newsvolume$year)]
crisis.esc.stages$Coverage <- newsvolume$articles[match(crisis.esc.stages$year, newsvolume$year)]

# Escalation-stage ratios: CNW and CL as shares of total coverage, and CNW
# as a share of CL.
crisis.esc.stages$CNWtoCoverage <- crisis.esc.stages$CNW / crisis.esc.stages$Coverage
crisis.esc.stages$CLtoCoverage <- crisis.esc.stages$CL / crisis.esc.stages$Coverage
crisis.esc.stages$CNWtoCL <- crisis.esc.stages$CNW / crisis.esc.stages$CL

# Normalize each series to its maximum.
# FIX: na.rm = TRUE on all three maxima — the match() lookups above can
# produce NA, and max() without na.rm would then return NA and blank the
# whole normalized column. Previously only CNW100 was guarded.
crisis.esc.stages$Coverage100 <- crisis.esc.stages$Coverage / max(crisis.esc.stages$Coverage, na.rm = TRUE)
crisis.esc.stages$CL100 <- crisis.esc.stages$CL / max(crisis.esc.stages$CL, na.rm = TRUE)
crisis.esc.stages$CNW100 <- crisis.esc.stages$CNW / max(crisis.esc.stages$CNW, na.rm = TRUE)

# Quick visual sanity check (not saved): the three normalized series on one
# canvas — total coverage (blue), crisis labelling (green), CNW volume (red).
ggplot(crisis.esc.stages, aes(x = year)) +
  geom_point(aes(y = Coverage100), color = "blue") +
  geom_point(aes(y = CL100), color = "green") +
  geom_point(aes(y = CNW100), color = "red")

# Yearly share (%) of total coverage carrying crisis labelling, with a
# loess trend (span 0.25) and half-century guide lines.
CL.time <- ggplot(crisis.esc.stages, aes(x = year, y = 100 * CLtoCoverage))
CL.time <- CL.time +
  geom_vline(xintercept = fifty, color = "lightsteelblue", size = 1) +
  geom_point() +
  geom_smooth(span = 0.25, color = "steelblue", fill = "steelblue")
CL.time <- CL.time +
  theme_bluewhite() +
  ylim(0, 5) +
  labs(x = "Year", y = "Share of coverage with crisis labelling (CL)") +
  scale_x_continuous(breaks = seq(1780, 2020, 10))

# Small-format variant of CL.time (coarser x breaks, multi-line y label)
# for compact figure panels.
CL.time.small <- ggplot(crisis.esc.stages, aes(x = year, y = 100 * CLtoCoverage)) +
  geom_vline(xintercept = fifty, color = "lightsteelblue", size = 1) +
  geom_point() +
  geom_smooth(span = 0.25, color = "steelblue", fill = "steelblue") +
  theme_bluewhite() +
  ylim(0, 5) +
  ylab("Share of news stories \n with crisis labelling \n(% of total news coverage)") +
  xlab("Year") +
  scale_x_continuous(breaks = seq(1800, 2000, 50))

ggsave(CL.time.small, file = "CL_time_small.svg", device = "svg", dpi = 1200, unit = "cm", width = 4.4, height = 2.2, scale = 3.00)

ggsave(CL.time, file = "CL_time.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)



# Yearly share (%) of coverage that is part of crisis news waves (CNWs).
CNW.time <- ggplot(crisis.esc.stages, aes(x = year, y = 100 * CNWtoCoverage))
CNW.time <- CNW.time +
  geom_vline(xintercept = fifty, color = "lightsteelblue", size = 1) +
  geom_point() +
  geom_smooth(span = 0.25, color = "steelblue", fill = "steelblue") +
  theme_bluewhite() +
  labs(
    x = "Year",
    y = "Share of coverage that is part of crisis news waves (CNWs)"
  ) +
  scale_x_continuous(breaks = seq(1780, 2020, 10))

ggsave(CNW.time, file = "CNW_time.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)


# Yearly count of crisis-event articles (log10 y-axis), points labelled and
# sized by volume, with linear (green, dashed) and loess (pink) trends.
# FIX: removed a dead first version of this plot that was built and then
# immediately overwritten by this assignment (it differed only in vline
# color and lacked the log scale/axis breaks; it was never saved or shown).
crisisarticles.time <- ggplot(crisisarticles, aes(y = volume, x = year, label = year, size = (volume))) +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "lightblue3", size = 1) +
  geom_point(color = "gray", shape = 17) +
  geom_text(color = "black") +
  geom_smooth(linetype = "longdash", fill = "#44aa66", color = "#44aa66", method = "lm", na.rm = TRUE) +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77", na.rm = TRUE) +
  xlab("Year") +
  ylab("Number of articles assigned to crisis events per year") +
  theme_bluewhite() +
  theme(legend.position = "none") +
  scale_y_continuous(trans = "log10") +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  annotation_logticks()


# Linear trend of crisis-article volume; print predictions at the
# half-century marks (and 2050) to the console.
crisisarticles.lm <- lm(volume ~ year, data = crisisarticles)
predict(crisisarticles.lm, newdata = list(year = c(1800, 1850, 1900, 1950, 2000, 2050)))

ggsave(crisisarticles.time, file = "crisisarticles+time.svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.5)





# Number of crisis events per year.
# FIX: count rows directly with FUN = length instead of summing
# duration/duration — that trick yields NaN for zero durations and NA for
# missing ones, silently corrupting the yearly counts.
eventcount <- aggregate(wavedata.vol50$duration, by = list(wavedata.vol50$year), FUN = length)

# Tidy up aggregate()'s default column names (Group.1 / x).
eventcount$year <- as.numeric(eventcount$Group.1)
eventcount$count <- eventcount$x

# Yearly count of identified crisis events, labelled and sized by count,
# with linear (green, dashed) and loess (pink, span 0.25) trends.
eventcount.time <- ggplot(eventcount, aes(y = count, x = year, label = year, size = count)) +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "lightblue3", size = 1) +
  geom_point(shape = 17, color = "gray") +
  geom_text(color = "black") +
  geom_smooth(linetype = "longdash", fill = "#44aa66", color = "#44aa66", method = "lm", na.rm = TRUE) +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77", na.rm = TRUE, span = .25) +
  xlab("Year") +
  ylab("Number of crisis events identified") +
  theme_bluewhite() +
  theme(legend.position = "none") +
  scale_x_continuous(breaks = seq(1780, 2020, 10))

# Small-format variant: coarser breaks, multi-line y label.
eventcount.time.small <- ggplot(eventcount, aes(y = count, x = year, label = year, size = count)) +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "lightblue3", size = 1) +
  geom_point(shape = 17, color = "gray") +
  geom_text(color = "black") +
  geom_smooth(linetype = "longdash", fill = "#44aa66", color = "#44aa66", method = "lm", na.rm = TRUE) +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77", na.rm = TRUE, span = .25) +
  xlab("Year") +
  ylab("Number of crisis \n news waves identified") +
  theme_bluewhite() +
  theme(legend.position = "none") +
  scale_x_continuous(breaks = seq(1800, 2000, 50))

ggsave(eventcount.time.small, file = "eventcount_time_small.svg", device = "svg", dpi = 1200, unit = "cm", width = 4.4, height = 2.2, scale = 3.00)

# Rebuild wavedata.vol50 from wd250x.
# NOTE(review): this overwrites the object used by the aggregations above,
# so execution order of this script matters.
wavedata.vol50 <- wd250x
wavedata.vol50$YEAR <- as.numeric(wavedata.vol50$year)
# Map each 250-topic id to its 20-topic area label via the lookup table.
wavedata.vol50$AREA_i <- to.lab[match(wavedata.vol50$topic250, to.lab$topic250), "topic20"]

# First year each area appears; used to order factor levels by debut year.
areas.by.year <- tapply(wavedata.vol50$YEAR, wavedata.vol50$AREA_i, min, na.rm = TRUE)

wavedata.vol50$AREA_f <- factor(wavedata.vol50$AREA_i, levels = names(areas.by.year)[order(areas.by.year)], ordered = TRUE)

# 19-color palette interleaving plasma and viridis so adjacent categories
# contrast strongly.
inf <- c(plasma(n = 10, alpha = 1, begin = 0.1, end = 0.9, direction = -1))
infr <- c(viridis(n = 10, alpha = 1, begin = 0.1, end = 0.9, direction = 1))
infscale <- c(inf[1], infr[1], inf[2], infr[2], inf[3], infr[3], inf[4], infr[4], inf[5], infr[5], inf[6], infr[6], inf[7], infr[7], inf[8], infr[8], inf[9], infr[9], inf[10])

# Timeline of crisis events by topic area (areas ordered by first
# appearance on the y-axis), jittered squares sized by event volume.
eventscope <- ggplot(
  subset(wavedata.vol50, !is.na(AREA_i)),
  aes(x = YEAR, y = AREA_f, group = AREA_i, color = AREA_i, fill = AREA_i, size = volume, label = YEAR)
)
eventscope <- eventscope +
  geom_vline(xintercept = fifty, color = "#a99171", size = 1.5) +
  geom_jitter(shape = 15, alpha = .8, width = 0, height = 0.25, show.legend = FALSE) +
  theme_soft() +
  labs(y = "Topic area") +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  scale_color_manual(values = infscale)

ggsave(eventscope, file = "eventscope.svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.5)

# Quick-look timelines per topic-area code: event years plotted at y = 1,
# text sized by log(volume), jittered to reduce overplotting.
# FIX: refactored nine near-identical copy-pasted blocks into one helper
# plus a loop; the explicit print() reproduces the auto-printing of the
# original top-level ggplot expressions.
plot_area_years <- function(codes) {
  # One jittered text-timeline for the rows of wd250x whose topic20 code is
  # in `codes` (a character vector of one or more codes).
  ggplot(subset(wd250x, topic20 %in% codes), aes(x = as.numeric(year), y = 1, label = year, size = log(volume))) +
    geom_text(position = position_jitter(0.1)) +
    theme_soft() +
    scale_x_continuous(limits = c(1780, 2020), breaks = seq(1800, 2000, 50), minor_breaks = seq(1780, 2020, 10))
}

# Same area codes, in the same order, as the original copy-pasted plots.
for (codes in list("ENV", "NRG", "EPI", "DIS", "TRA", "ECO", "HEA", c("SCI", "EDU"), "GEO", "LEI")) {
  print(plot_area_years(codes))
}


# Compact version of the event-scope timeline for small-format display,
# omitting the sparsely covered areas listed below.
dropped_areas <- c("Functional", "Family", "Society", "Education", "ScienceTech", "Public")
eventscope.small <- ggplot(
  subset(wavedata.vol50, !is.na(AREA_i) & !(AREA_i %in% dropped_areas)),
  aes(x = YEAR, y = AREA_f, group = AREA_i, color = AREA_i, fill = AREA_i, size = volume, label = YEAR)
) +
  geom_vline(xintercept = fifty, color = "lightskyblue2", size = 1.5) +
  geom_jitter(width = 0, height = 0.25, show.legend = FALSE) +
  theme_bluewhite() +
  labs(y = "Topic area") +
  scale_x_continuous(breaks = seq(1800, 2000, 50))

ggsave(eventscope.small, file = "eventscope_small.svg", dpi = 1200, unit = "cm", width = 4.2, height = 2.1, scale = 3)



# Linear trend in yearly event counts; print predictions at the
# half-century marks (and 2050) to the console.
eventcount.lm <- lm(count ~ year, data = eventcount)
predict(eventcount.lm, newdata = list(year = c(1800, 1850, 1900, 1950, 2000, 2050)))

# FIX: the original read `data = x`, but no data frame `x` with columns `x`
# and `Group.1` exists at this point in the script. eventcount carries
# exactly those raw aggregate() columns, so the model was evidently meant
# to run on it (same regression as above, on the untransformed columns).
summary(lm(x ~ as.numeric(Group.1), data = eventcount))

ggsave(eventcount.time, file = "eventcount+time.svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.5)

# Renumber topics by the year they first appear (1 = earliest debut).
firstyear <- tapply(wavedata.vol50$YEAR, wavedata.vol50$topic, min, na.rm = TRUE)
topictrafo <- firstyear[order(firstyear)]
# FIX: seq_along() instead of the hard-coded 1:120 — data.frame() errors
# (or silently recycles) whenever the number of topics differs from 120.
topic.trafo <- data.frame(original = names(topictrafo), modified = seq_along(topictrafo))

# Attach the new ordinal id to each event as a factor.
wavedata.vol50$topic.trafo <- factor(topic.trafo[match(wavedata.vol50$topic, topic.trafo$original), "modified"])

# Decade boundaries 1780-2020 as POSIXct timestamps (for date-axis breaks),
# plus the half-century marks used for reference lines. Generated
# programmatically instead of spelling out all 25 date strings.
datebreaks <- as.POSIXct(paste0(seq(1780, 2020, 10), "-01-01"))

fifty2 <- as.POSIXct(paste0(seq(1800, 2000, 50), "-01-01"))

# Gantt-style density plot: one rectangle per crisis news wave.
# x: wave start/extent converted from days since origin to POSIXct seconds
# (* 60 * 60 * 24); y: the debut-ordered topic id, bar height scaled by
# intensity.
# NOTE(review): duration is multiplied by 10 for visual width — presumably
# deliberate exaggeration; confirm the unit (days) against the wave
# detection code.
crisisplot <- ggplot(wavedata.vol50, aes(color = topic.trafo, fill = topic.trafo, xmin = as.POSIXct(start * 60 * 60 * 24, origin = "0000-01-01"), xmax = as.POSIXct((start + duration * 10) * 60 * 60 * 24, origin = "0000-01-01"), ymin = as.numeric(topic.trafo), ymax = as.numeric(topic.trafo) + 2 * intensity)) +
  geom_vline(xintercept = fifty2, size = 1.15, color = "lightblue3") +
  # Background panel drawn as one large rectangle spanning the full range.
  geom_rect(fill = "aliceblue", color = "skyblue3", size = 1.5, aes(ymin = 0, ymax = 110, xmin = as.POSIXct("1788-01-01"), xmax = as.POSIXct("2022-12-31"))) +
  geom_rect() +
  scale_fill_viridis_d(na.value = "grey80") +
  scale_color_viridis_d(na.value = "grey80") +
  theme_bluewhite() +
  ylim(0, 110) +
  ylab("Topic ID") +
  xlab("Year") +
  theme(legend.position = "none") +
  # POSIXct breaks relabelled with plain decade numbers.
  scale_x_continuous(breaks = datebreaks, labels = seq(1780, 2020, 10)) +
  scale_y_continuous(breaks = seq(10, 100, 10), minor_breaks = seq(0, 110, 5))

ggsave(crisisplot, file = "crisisdensity+time.svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

# Prediction points: half-century marks plus an out-of-sample 2050.
years50 <- c(1800, 1850, 1900, 1950, 2000, 2050)

wavedata.vol50$YEAR <- as.numeric(wavedata.vol50$year)

# Total newspaper output per year, for normalization.
wavedata.vol50$TotalNewspaperVolume <- thetimes[match(wavedata.vol50$year, thetimes$year), "articles"]

# Linear time trends of the wave characteristics, each followed by
# console predictions at the half-century marks.
volume.lm <- (lm(volume ~ (YEAR), data = wavedata.vol50))
predict(volume.lm, newdata = list(YEAR = years50))

duration.lm <- (lm(duration ~ (YEAR), data = wavedata.vol50))
predict(duration.lm, newdata = list(YEAR = years50))

intensity.lm <- (lm(intensity ~ (YEAR), data = wavedata.vol50))
predict(intensity.lm, newdata = list(YEAR = years50))

max.intensity.lm <- (lm(max.intensity ~ (YEAR), data = wavedata.vol50))
predict(max.intensity.lm, newdata = list(YEAR = years50))

variability.lm <- (lm(variability ~ (YEAR), data = wavedata.vol50))
predict(variability.lm, newdata = list(YEAR = years50))

# Same trends with the intercept re-anchored at 1788 (first year of data).
# NOTE(review): intensity_i is used here but (re)defined only below these
# models — this works only if intensity_i already exists on wavedata.vol50
# from earlier code; verify, or move the definition above this block.
summary(lm(duration ~ I(as.numeric(year) - 1788), data = wavedata.vol50))
summary(lm(intensity_i ~ I(as.numeric(year) - 1788), data = wavedata.vol50))
summary(lm(max.intensity ~ I(as.numeric(year) - 1788), data = wavedata.vol50))
summary(lm(variability ~ I(as.numeric(year) - 1788), data = wavedata.vol50))
summary(lm((variability / intensity) ~ I(as.numeric(year) - 1788), data = wavedata.vol50))
summary(lm(baseline365 ~ I(as.numeric(year) - 1788), data = wavedata.vol50))

# Average daily intensity = total wave volume / wave duration.
wavedata.vol50$intensity_i <- wavedata.vol50$volume / wavedata.vol50$duration

# Average daily coverage intensity per event over time, log10 y-axis, with
# linear (green, dashed) and loess (pink) trends.
intensity.trajectory <- ggplot(wavedata.vol50, aes(x = as.numeric(year), y = intensity_i))
intensity.trajectory <- intensity.trajectory +
  geom_jitter(size = 0.5) +
  geom_smooth(linetype = "longdash", fill = "#44aa66", color = "#44aa66", method = "lm", na.rm = TRUE) +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77") +
  theme_bluewhite() +
  labs(
    x = "Year",
    y = "Average daily intensity of coverage\nduring the identified crisis event"
  ) +
  scale_y_log10(breaks = c(0.05, 0.1, 1, 5), limits = c(0.05, 5)) +
  annotation_logticks() +
  scale_x_continuous(breaks = seq(1780, 2020, 10), minor_breaks = (seq(1780, 2020, 2)))


ggsave(intensity.trajectory, file = "intensity+time.svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

# Duration of above-baseline coverage per event over time (linear y-axis,
# capped at 160 days).
duration.trajectory <- ggplot(wavedata.vol50, aes(x = as.numeric(year), y = duration)) +
  geom_jitter(size = 0.5) +
  geom_smooth(linetype = "longdash", fill = "#44aa66", color = "#44aa66", method = "lm", na.rm = TRUE) +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77") +
  theme_bluewhite() +
  xlab("Year") +
  ylab("Average duration of above-baseline coverage\nduring the identified crisis event") +
  ylim(0, 160) +
  scale_x_continuous(breaks = seq(1780, 2020, 10), minor_breaks = (seq(1780, 2020, 2)))

ggsave(duration.trajectory, file = "duration+time.svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

# Total coverage volume per event over time, log10 y-axis, with linear and
# loess trends.
# FIX: removed the ylim(0, 500) call — it was overridden by
# scale_y_log10(..., limits = c(5, 500)) (ggplot2 keeps only the last y
# scale and emitted a "scale already present" warning); rendered output is
# unchanged.
volume.trajectory <- ggplot(wavedata.vol50, aes(x = as.numeric(year), y = volume)) +
  geom_jitter(size = 0.5) +
  geom_smooth(linetype = "longdash", fill = "#44aa66", color = "#44aa66", method = "lm", na.rm = TRUE) +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77") +
  theme_bluewhite() +
  xlab("Year") +
  ylab("Total volume of coverage\nduring the identified crisis case") +
  scale_x_continuous(breaks = seq(1780, 2020, 10), minor_breaks = seq(1780, 2020, 2)) +
  scale_y_log10(breaks = c(5, 10, 50, 100, 500), limits = c(5, 500)) +
  annotation_logticks()
volume.trajectory

ggsave(volume.trajectory, file = "volume+time.svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

# Small-format variant of the volume trajectory (coarser breaks).
# FIX: removed the ylim(0, 500) call — overridden by scale_y_log10's
# limits, same redundant-scale warning as in the full-size plot.
volume.trajectory.small <- ggplot(wavedata.vol50, aes(x = as.numeric(year), y = volume)) +
  geom_jitter(size = 0.5) +
  geom_smooth(linetype = "longdash", fill = "#44aa66", color = "#44aa66", method = "lm", na.rm = TRUE) +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77") +
  theme_bluewhite() +
  xlab("Year") +
  ylab("Total volume of coverage\nduring crisis news waves") +
  scale_x_continuous(breaks = seq(1800, 2000, 50), minor_breaks = seq(1775, 2025, 25)) +
  scale_y_log10(breaks = c(5, 10, 50, 100, 500), limits = c(5, 500)) +
  annotation_logticks()

ggsave(volume.trajectory.small, file = "volume+time_small.svg", dpi = 1200, unit = "cm", width = 4.2, height = 2.1, scale = 3.00)


# Peak (maximum single-period) coverage per event over time, log10 y-axis.
peak.trajectory <- ggplot(wavedata.vol50, aes(x = as.numeric(year), y = max.intensity)) +
  geom_jitter(size = 0.5) +
  geom_smooth(linetype = "longdash", fill = "#44aa66", color = "#44aa66", method = "lm", na.rm = TRUE) +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77") +
  theme_bluewhite() +
  xlab("Year") +
  ylab("Maximum amount of coverage\nduring the identified crisis case") +
  scale_y_log10(breaks = c(1, 10, 100), limits = c(0.1, 100)) +
  scale_x_continuous(breaks = seq(1780, 2020, 10), minor_breaks = seq(1780, 2020, 2)) +
  annotation_logticks()

ggsave(peak.trajectory, file = "peak+time.svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)





# Exploratory trajectories of coverage variability (relative to intensity,
# and absolute); built but never saved or printed here.
relative.variability.trajectory <- ggplot(wavedata.vol50, aes(x = as.numeric(year), y = variability / intensity)) +
  geom_point() +
  geom_smooth() +
  theme_bluewhite()
variability.trajectory <- ggplot(wavedata.vol50, aes(x = as.numeric(year), y = variability)) +
  geom_point() +
  geom_smooth() +
  theme_bluewhite()


### Total volume of coverage

# Crisis-labelled articles per year (counted from the STM article table)
# and their share of total coverage.
x <- data.frame(table(wide.STM$year))
newsvolume$crisis <- x[match(newsvolume$year, x$Var1), "Freq"]
newsvolume$crisis.share <- newsvolume$crisis / newsvolume$articles

# Yearly CL share of total coverage with loess (pink) and linear (green)
# trends and half-century guide lines.
crisis.labelling <- ggplot(newsvolume, aes(x = year, y = 100 * crisis.share)) +
  geom_vline(xintercept = fifty, size = 1.15, color = "lightblue3") +
  geom_point() +
  geom_smooth(span = 0.25, na.rm = TRUE, fill = "#dd1c77", color = "#dd1c77") +
  geom_smooth(na.rm = TRUE, method = "lm", fill = "#44aa66", color = "#44aa66", ) +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10), minor_breaks = seq(1780, 2020, 2)) +
  xlab("Year") +
  ylab("Share of total coverage with crisis labelling")

# Topic shares within crisis-labelled coverage; Topics factor is ordered by
# descending share for plotting.
topic.STM$Total <- sum(topic.STM$x)
topic.STM$Share <- topic.STM$x / topic.STM$Total
topic.STM$Topics <- factor(topic.STM$Group.1, levels = topic.STM$Group.1[order(topic.STM$Share, decreasing = TRUE)], ordered = TRUE)

# Cross-sectional distribution of topics within crisis-labelled coverage:
# percentage bars, each labelled with its rounded share just above the bar.
economic.topics <- ggplot(topic.STM, aes(y = 100 * Share, x = Topics, color = Topics, fill = Topics))
economic.topics <- economic.topics +
  geom_col() +
  geom_text(aes(label = 100 * round(Share, 3), y = 100 * Share + 1), color = "black", size = 3) +
  theme_bluewhite() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(y = "Share of topics in coverage with crisis labelling") +
  scale_color_viridis_d() +
  scale_fill_viridis_d() +
  theme(legend.position = "none")

# Stacked-area plot of per-decade topic percentages, titled with the area
# name.
#
# d:     data frame with columns `article`, `decade`, `area`.
# area2: plot title (area label).
# Returns a ggplot object.
topicareaplot2 <- function(d, area2) {
  # FIX: per-decade totals via ave(), which aligns the group sums to rows
  # by value. The original indexed tapply() output with
  # match(d$decade, d$decade[1:25]), which mis-aligns (or produces NA)
  # whenever the first 25 rows do not enumerate the decades in tapply()'s
  # sorted order, or there are not exactly 25 decades. (Also removed the
  # no-op `d <- d`.)
  d$total <- ave(d$article, d$decade, FUN = sum)
  d$percentage <- round(100 * d$article / d$total, 1)
  gg <- ggplot(d, aes(y = percentage, x = decade, fill = area)) +
    geom_area(color = "white") +
    geom_hline(yintercept = 0, color = "#dd4422") +
    theme_light() +
    ggtitle(area2) +
    scale_x_continuous(breaks = seq(1780, 2020, 10)) +
    scale_fill_viridis_d()
  return(gg)
}

# As topicareaplot2, but fills by the pre-ordered factor `area_ordered` and
# omits the title (the area2 argument is retained for interface
# compatibility but unused, as in the original).
#
# d:     data frame with columns `article`, `decade`, `area_ordered`.
# area2: unused.
# Returns a ggplot object.
topicareaplot3 <- function(d, area2) {
  # FIX: robust per-decade totals via ave() instead of the fragile
  # tapply()[match(d$decade, d$decade[1:25])] indexing; removed `d <- d`.
  d$total <- ave(d$article, d$decade, FUN = sum)
  d$percentage <- round(100 * d$article / d$total, 1)
  gg <- ggplot(d, aes(y = percentage, x = decade, fill = area_ordered)) +
    geom_area(color = "white") +
    geom_hline(yintercept = 0, color = "#dd4422") +
    theme_light() +
    scale_x_continuous(breaks = seq(1780, 2020, 10)) +
    scale_fill_viridis_d()
  return(gg)
}

# Order area levels by their total article count (ascending) so stacking is
# consistent across plots.
decade.STM_area2$area_ordered <- factor(decade.STM_area2$area, ordered = TRUE, levels = names(table(decade.STM_area2$area))[order(tapply(decade.STM_area2$articles, decade.STM_area2$area, sum, na.rm = TRUE), decreasing = FALSE)])

# Per-decade CL shares by topic area with a hand-picked 22-color palette.
# NOTE(review): the manual palette must have at least as many colors as
# there are area levels — update it if the area set changes.
economic.topics.time.share <- topicareaplot3(d = decade.STM_area2, area2 = names(table(decade.STM_area2$area))) + geom_vline(xintercept = fifty, size = 1.15, color = "lightblue3") + theme_bluewhite() + scale_fill_manual(values = c("#f7fbff", "#00441b", "#deebf7", "#006d2c", "#c6dbef", "#238b45", "#9ecae1", "#66a182", "#00798c", "#6baed6", "#41ab5d", "#4292c6", "#74c476", "#2171b5", "#a1d99b", "#08519c", "#c7e9c0", "#08306b", "#e5f5e0", "#111111", "#8d96a3", "red")) + theme(legend.position = "right") + ylab("Share of articles with crisis labelling") + guides(fill = guide_legend("Topic area"))

# Absolute counts of crisis-labelled articles per decade, stacked by topic
# area (grayscale ramp with red for the final level).
economic.topics.time.abs <- ggplot(data = decade.STM_area2, aes(y = articles, x = decade, fill = area_ordered))
economic.topics.time.abs <- economic.topics.time.abs +
  geom_vline(xintercept = fifty, size = 1.15, color = "lightblue3") +
  geom_area() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  theme_bluewhite() +
  scale_fill_manual(values = c("#555555", "#565656", "#666666", "#676767", "#777777", "#787878", "#888888", "#898989", "#999999", "#9a9a9a", "#aaaaaa", "#ababab", "#bbbbbb", "#bcbcbc", "#cccccc", "#cdcdcd", "#dddddd", "#dedede", "#eeeeee", "#fefefe", "#ffffff", "red")) +
  theme(legend.position = c(0.2, .525)) +
  labs(y = "Count of articles with crisis labelling") +
  guides(fill = guide_legend("Topic area"))

# Economy-area topic labels (entries 2-12 of the area table; entry 1 is
# excluded — presumably a non-economy catch-all, verify against the data).
ecotoplist <- names(table(decade.STM_area$area))[2:12]

# Shares within economic crisis labelling per decade.
withineconomic.topics.time.share <- topicareaplot2(d = subset(decade.STM_area, area %in% ecotoplist), area2 = ecotoplist) + theme_bluewhite() + theme(legend.position = "right") + geom_vline(xintercept = fifty, size = 1.15, color = "lightblue3") + ylab("Share of economic crisis labelling")

# Absolute counts of economic crisis labelling per decade.
withineconomic.topics.time.abs <- ggplot(data = subset(decade.STM_area, area %in% ecotoplist), aes(y = articles, x = decade, fill = area)) +
  geom_vline(xintercept = fifty, size = 1.15, color = "lightblue3") +
  geom_area() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  theme_bluewhite() +
  scale_fill_viridis_d() +
  ylab("Number of articles with crisis labelling") +
  theme(legend.position = (c(0.15, 0.5))) +
  guides(fill = guide_legend("Topic within Economy"))

# Binary event-type flag: economic vs non-economic crisis event.
wavedata.vol50$ECO <- ifelse(wavedata.vol50$AREA2 %in% ecotoplist, "Economic Crisis Event", "Non-Economic Crisis Event")

# Per-decade counts of each event type.
eco.CE.decade <- data.frame(table(wavedata.vol50$decade, wavedata.vol50$ECO))
names(eco.CE.decade) <- c("Decade", "EventType", "Count")
# NOTE(review): 27 and 98 are hard-coded per-type maxima used to index each
# series to 100; they must be updated whenever the event data change.
eco.CE.decade$rowmax <- as.numeric(as.character(Recode(eco.CE.decade$EventType, "'Economic Crisis Event'=27;'Non-Economic Crisis Event'=98;else=NA")))
eco.CE.decade$Index <- 100 * eco.CE.decade$Count / eco.CE.decade$rowmax
# Total events per decade; rep(times = 2) relies on table() listing both
# event types over the same ordered set of decades.
eco.CE.decade$AllEvents <- rep(subset(eco.CE.decade, EventType == "Economic Crisis Event")$Count + subset(eco.CE.decade, EventType == "Non-Economic Crisis Event")$Count, times = 2)

# Count of economic crisis events per decade, with linear (green, dashed)
# and loess (pink) trends and half-century guide lines.
eco.CE.time <- ggplot(
  data = subset(eco.CE.decade, EventType == "Economic Crisis Event"),
  aes(x = as.numeric(as.character(Decade)), y = Count)
) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_point() +
  geom_smooth(linetype = "longdash", fill = "#44aa66", color = "#44aa66", method = "lm", na.rm = TRUE) +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77") +
  theme_bluewhite() +
  labs(x = "Decade", y = "Economic Crisis Event Count") +
  scale_x_continuous(breaks = seq(1780, 2020, 10))

# Indexed comparison (100 = each type's historical maximum) of economic vs
# non-economic event counts per decade.
comp.CE.time.Index <- ggplot(data = eco.CE.decade, aes(x = as.numeric(as.character(Decade)), y = Index, color = EventType, fill = EventType, group = EventType)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_point() +
  geom_smooth() +
  theme_bluewhite() +
  xlab("Decade") +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  theme(legend.position = c(0.825, 0.125)) +
  scale_color_viridis_d(begin = .2, end = .8) +
  scale_fill_viridis_d(begin = .2, end = .8) +
  ylab("Index of Crisis Events (100=maximum)")

# Absolute counts of both event types per decade.
comp.CE.time.abs <- ggplot(data = eco.CE.decade, aes(x = as.numeric(as.character(Decade)), y = Count, color = EventType, fill = EventType, group = EventType)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_point() +
  geom_smooth() +
  theme_bluewhite() +
  xlab("Decade") +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  theme(legend.position = c(0.8, 0.8)) +
  scale_color_viridis_d(begin = .2, end = .8) +
  scale_fill_viridis_d(begin = .2, end = .8) +
  ylab("Count of Crisis Events")

# Share of each event type per decade as stacked columns.
comp.CE.time.col <- ggplot(data = eco.CE.decade, aes(x = as.numeric(as.character(Decade)), y = 100 * Count / AllEvents, color = EventType, fill = EventType, group = EventType)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_col() +
  theme_bluewhite() +
  xlab("Decade") +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  theme(legend.position = c(0.2, 0.2)) +
  scale_color_viridis_d(begin = .2, end = .8) +
  scale_fill_viridis_d(begin = .2, end = .8) +
  ylab("Share of Crisis Events")

# Composition of economic crisis events by sub-type (AREA2) per decade,
# shown as filled (proportional) bars.
economic.crisis.types.decades <- ggplot(data = subset(wavedata.vol50, AREA == "Economy"), aes(x = decade, color = AREA2, fill = AREA2)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_bar(position = "fill") +
  theme_bluewhite() +
  xlab("Decade") +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  scale_fill_viridis_d() +
  scale_color_viridis_d()

# Event volume over time, split economic vs non-economic (log10 y-axis).
crisis.type.volume.trajectory <- ggplot(wavedata.vol50, aes(size = volume, x = YEAR, y = volume, color = ECO, fill = ECO, group = ECO)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_point() +
  geom_smooth() +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  scale_color_viridis_d(option = "inferno", begin = .2, end = .8) +
  scale_fill_viridis_d(option = "inferno", begin = .2, end = .8) +
  theme(legend.position = "bottom") +
  guides(size = "none") +
  scale_y_log10() +
  annotation_logticks()

# Event duration over time, split economic vs non-economic.
crisis.type.duration.trajectory <- ggplot(wavedata.vol50, aes(size = duration, x = YEAR, y = duration, color = ECO, fill = ECO, group = ECO)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_point(alpha = .3) +
  geom_smooth() +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  scale_color_viridis_d(option = "inferno", begin = .2, end = .8) +
  scale_fill_viridis_d(option = "inferno", begin = .2, end = .8) +
  theme(legend.position = "bottom") +
  guides(size = "none")


# Per-year counts of economic vs non-economic crisis events, long format.
eco.CE.year <- data.frame(table(wavedata.vol50$YEAR, wavedata.vol50$ECO))

# Reshape to wide: one row per year with both counts.
# FIX: select the two event types by their Var2 label instead of the
# hard-coded row ranges 1:172 / 173:344, which silently break as soon as
# the number of distinct years differs from 172. as.data.frame(table())
# orders rows by Var1 within each Var2 level, so the two subsets align
# year-by-year.
eco.CE.year2 <- data.frame(
  year = subset(eco.CE.year, Var2 == "Economic Crisis Event")$Var1,
  ECO = subset(eco.CE.year, Var2 == "Economic Crisis Event")$Freq,
  NonECO = subset(eco.CE.year, Var2 == "Non-Economic Crisis Event")$Freq
)

# Half-century reference marks (re-declared here; also defined earlier).
fifty <- c(1800, 1850, 1900, 1950, 2000)

# Spillover view: yearly count of non-economic crisis events, with point
# size/color driven by the count of economic events in the same year.
crisis.spillover <- ggplot(
  eco.CE.year2,
  aes(size = ECO, x = as.numeric(as.character(year)), y = NonECO, color = ECO, fill = ECO)
) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_point() +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  scale_color_viridis(option = "inferno", begin = .2, end = .8) +
  scale_fill_viridis(option = "inferno", begin = .2, end = .8) +
  theme(legend.position = c(0.15, 0.7)) +
  guides(size = guide_legend("Count of\nEconomic\nCrisis Events"), color = guide_legend("Count of\nEconomic\nCrisis Events"), fill = guide_legend("Count of\nEconomic\nCrisis Events")) +
  labs(x = "Year", y = "Count of Non-Economic Crisis Events")

# Yearly coverage volume attributable to identified crisis events.
CE.volume <- aggregate(wavedata.vol50$volume, by = list(wavedata.vol50$YEAR), FUN = "sum")

# Attach to newsvolume; years without events count as 0 (NA -> 0 so the
# ratio below is defined everywhere crisis > 0).
newsvolume$CE.coverage <- CE.volume[match(newsvolume$year, CE.volume$Group.1), "x"]
newsvolume$CE.coverage <- replace(newsvolume$CE.coverage, is.na(newsvolume$CE.coverage), 0)
newsvolume$crisis <- replace(newsvolume$crisis, is.na(newsvolume$crisis), 0)
# Share of crisis-labelled coverage linked to a specific crisis event.
newsvolume$CEtoCL <- newsvolume$CE.coverage / newsvolume$crisis

CE_to_CL_trajectory <- ggplot(newsvolume, aes(x = year, y = 100 * CEtoCL)) +
  geom_vline(xintercept = fifty, color = "lightblue3", size = 1.15) +
  geom_smooth(color = "deepskyblue4", fill = "deepskyblue4") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10), minor_breaks = seq(1780, 2020, 2)) +
  scale_y_continuous(breaks = seq(0, 100, 20), minor_breaks = seq(0, 100, 10), limits = c(0, 100)) +
  ylab("Share of crisis labelling coverage\nthat is linked to a crisis event")

# Quick exploratory checks (not saved): crisis-event coverage and crisis
# labelling as percentage shares of all articles, and their ratio.
ggplot(newsvolume, aes(x = year, y = 100 * CE.coverage / articles)) +
  geom_point() +
  geom_smooth()
ggplot(newsvolume, aes(x = year, y = 100 * crisis / articles)) +
  geom_point() +
  geom_smooth()
ggplot(newsvolume, aes(x = year, y = 100 * CE.coverage / crisis)) +
  geom_point() +
  geom_smooth()

### What share of crisis labelling can be assigned to specific crisis events?
ggsave(CE_to_CL_trajectory, file = "CE_to_CL_trajectory.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### Trajectory of crisis labelling over time
### (NOTE(review): output name "CL_trejectory.svg" carries a typo — kept so
### downstream references to the file still match)
ggsave(crisis.labelling, file = "CL_trejectory.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### Share of economic topics in crisis labelling (cross-section)
ggsave(economic.topics, file = "eco_CL_share_cs.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the salience of economic topics with crisis labelling developed? Shares.
ggsave(economic.topics.time.share, file = "eco_CL_share.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the salience of economic topics with crisis labelling developed? Absolute numbers.
ggsave(economic.topics.time.abs, file = "eco_CL_abs.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the distribution of economic topics with crisis labelling developed? Shares.
ggsave(withineconomic.topics.time.share, file = "within_eco_CL_share.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the distribution of economic topics with crisis labelling developed? Absolute numbers.
ggsave(withineconomic.topics.time.abs, file = "within_eco_CL_abs.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the count of economic crisis events developed?
ggsave(eco.CE.time, file = "economic_CE_trajectory.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the count of economic (vs non-economic) crisis events developed? Indexed.
ggsave(comp.CE.time.Index, file = "economic_vs_noneconomic_CE_trajectory_index.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the count of economic (vs non-economic) crisis events developed? Absolute.
ggsave(comp.CE.time.abs, file = "economic_vs_noneconomic_CE_trajectory_abs.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the share of economic (vs non-economic) crisis events developed?
ggsave(comp.CE.time.col, file = "economic_vs_noneconomic_CE_trajectory.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the composition of economic crisis event topics developed?
ggsave(economic.crisis.types.decades, file = "within_economic_topics_trajectory.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the volume of economic and non-economic crisis events developed?
ggsave(crisis.type.volume.trajectory, file = "crisis_type_volume_trajectory.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### How has the duration of economic and non-economic crisis events developed?
ggsave(crisis.type.duration.trajectory, file = "crisis_type_duration_trajectory.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

### Do economic and non-economic crises co-occur more frequently today?
ggsave(crisis.spillover, file = "crisis_cooccurrence.svg", device = "svg", dpi = 1200, unit = "cm", width = 16, height = 8, scale = 1.25)

# Exploratory check of `epiq` (presumably an epidemic subset — defined
# elsewhere; confirm): volume per year, double-encoded as y and point size.
ggplot(epiq, aes(x = YEAR, y = volume, size = volume)) +
  geom_point() +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10))


# Per-decade topic entropy of coverage, standardized by the maximum
# achievable entropy of a uniform distribution over 120 topics (= log 120).
max.h <- entropy(rep(1 / 120, times = 120), method = "ML")
decades <- seq(1780, 2010, 10)
h <- data.frame(decades = decades, raw.entropy = NA, std.entropy = NA, max.entropy = max.h)

# Per-row decade totals.  ave() aligns each row with its own group sum
# regardless of row order; the original assigned the tapply() result to the
# column, which relies on silent recycling in exactly the right order and
# breaks if decade.STM is sorted differently.
decade.STM$total <- ave(decade.STM$article, decade.STM$decade, FUN = function(v) sum(v, na.rm = TRUE))
decade.STM$share <- decade.STM$article / decade.STM$total

for (i in seq_along(decades)) {
  h[i, "raw.entropy"] <- entropy(subset(decade.STM, decade == decades[i])$share, method = "ML")
  h[i, "std.entropy"] <- h[i, "raw.entropy"] / h[i, "max.entropy"]
}

# Standardized topic entropy per decade: the red line at 1 marks perfectly
# even coverage over all 120 topics.  Green = linear trend, pink dashed =
# loess trend.  The `decades != 2030` filter is a no-op safeguard since
# `decades` ends at 2010.
entropy_coverage <- ggplot(subset(h, decades != 2030), aes(x = decades, y = std.entropy)) +
  geom_vline(xintercept = fifty, size = 1.15, color = "lightblue3") +
  geom_point() +
  geom_smooth(method = "lm", fill = "#44aa66", color = "#44aa66") +
  geom_smooth(fill = "#dd1c77", color = "#dd1c77", linetype = "longdash") +
  geom_hline(yintercept = 1, color = "red") +
  # NOTE(review): theme_light() is overridden by theme_bluewhite() below
  # (complete themes replace each other) — this call looks redundant.
  theme_light() +
  ylim(0.7, 1) +
  ylab("Standardized Entropy") +
  xlab("Decade") +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10))

ggsave(entropy_coverage, file = "topicentropy_decade.svg", units = "cm", width = 16, height = 8, dpi = 1200, scale = 1.35)


# Per-domain subsets of the crisis-event wave data.  %in% replaces the
# original == / | chains: it handles the multi-label domains in one term
# and, like subset() with ==, excludes rows whose AREA is NA.
CE.Economy <- subset(wavedata.vol50, AREA %in% c("Economy", "Energy"))
CE.Disaster <- subset(wavedata.vol50, AREA %in% "Disaster")
CE.Epidemic <- subset(wavedata.vol50, AREA %in% c("Epidemic", "Epidemics", "Health"))
CE.Functional <- subset(wavedata.vol50, AREA %in% "Functional")
CE.Geopolitical <- subset(wavedata.vol50, AREA %in% "Geopolitical")
CE.Government <- subset(wavedata.vol50, AREA %in% "Government")
CE.Justice <- subset(wavedata.vol50, AREA %in% "Justice")
CE.Military <- subset(wavedata.vol50, AREA %in% "Military")
CE.Transport <- subset(wavedata.vol50, AREA %in% "Transport")


# Epidemic waves above the volume threshold, on a log scale.
# Bug fix: the wave data uses the upper-case YEAR column (as in every other
# plot built from wavedata.vol50); the original referenced a lower-case
# `year` that does not exist in this data.
ggplot(subset(CE.Epidemic, volume > 50), aes(y = volume, x = YEAR, size = volume)) +
  geom_point(fill = "white", shape = 1) +
  scale_y_log10()

# Which wordlists actually occur among epidemic waves?  Tabulate once
# instead of computing the same table twice.
epidemic.wordlists <- table(CE.Epidemic$wordlist)
epidemic.wordlists[epidemic.wordlists > 0]



# Persist the wave-level datasets so later sessions can load() them
# without rebuilding the corpus.
save(wavedata, file = "wavedata.RData")
save(wavedata.vol50, file = "wavedata_vol50.RData")
save(wave.keywords, file = "wave_keywords.RData")
save(tss, file = "tss_total.RData")
save(wd5, file = "wd5.RData")

# Collect two example texts per crisis news wave and write them out for
# manual inspection.  One slot per wave is pre-allocated (W = number of
# waves, defined earlier in the script); growing a vector with [w] <-
# starting from a scalar copies it on every iteration.
textdirectory1 <- character(W)
textdirectory2 <- character(W)

# Assemble the export text for the k-th example article of wave w: the two
# id/metadata fields from the wave's text table, the wave keyword list,
# topic and area, then the article's newspaper, date, headline, and full
# text looked up by id in textfiles.1.
format_event_text <- function(w, k) {
  texts <- wave.keywords[[w]]$texts
  hit <- which(textfiles.1$id == texts[k, 1])
  paste(
    texts[k, 1], "\n",
    texts[k, 2], "\n",
    paste(wave.keywords[[w]]$wordlist, collapse = " "), "\n",
    wave.keywords[[w]]$topic, "\n",
    wave.keywords[[w]]$area, "\n",
    textfiles.1$newspaper[hit], "\n",
    textfiles.1$date[hit], "\n",
    textfiles.1$headline[hit], "\n",
    textfiles.1$text[hit]
  )
}

for (w in seq_len(W)) {
  text1 <- format_event_text(w, 1)
  text2 <- format_event_text(w, 2)

  # NOTE(review): filename1 is not zero-padded and uses "---1", unlike the
  # padded "-2" scheme of filename2 — kept as in the original; confirm.
  filename1 <- paste0("et", w, "---1", ".txt")
  # Zero-pad the wave number to five digits so files sort lexicographically
  # (replaces the original's nested ifelse() padding).
  filename2 <- paste0("et", sprintf("%05d", w), "-2", ".txt")

  # Bug fix: the original called save.csv(), which does not exist in R and
  # would abort the loop; writeLines() writes the assembled plain text to
  # the per-wave file as evidently intended.
  writeLines(text1, filename1)

  textdirectory1[w] <- text1
  textdirectory2[w] <- text2
  # Progress indicator for the long-running loop.
  print(paste0(w, "/", W))
  flush.console()
}


# Persist the collected per-wave texts.  Passing a file name to
# writeLines() opens the connection, writes the newline-separated lines,
# and closes it again — same effect as the explicit file()/close() pair.
writeLines(textdirectory1, con = "EventText1.txt")

writeLines(textdirectory2, con = "EventText2.txt")


# Long-format table collecting topic-diversity indicators ("active" topic
# share and reversed Gini) for three corpora at three topic resolutions.

thetimes_L <- pivot_longer(thetimes3, cols = c("active250", "active50", "active20"), names_to = "categories", values_to = "x")

# Build the table directly from its parts; the original first seeded it
# with an all-NA placeholder row that then had to be filtered out again.
topic_diversity_data <- rbind(
  data.frame(year = thetimes_L$year, indicator = "active", categories = thetimes_L$categories, corpus = "crisis labelling", value = thetimes_L$x),
  data.frame(year = thetimes.d3$decade, indicator = "active", categories = thetimes.d3$name, corpus = "crisis news waves", value = thetimes.d3$value),
  data.frame(year = df.active_L$year, indicator = "active", categories = df.active_L$resolution, corpus = "routine", value = df.active_L$active),
  # Gini is reversed (1 - g) so higher values always mean more diversity.
  data.frame(year = thetimes3$year, indicator = "gini", categories = thetimes3$var, corpus = "crisis labelling", value = 1 - thetimes3$value),
  data.frame(year = rep(thetimes$year, times = 3), indicator = "gini", categories = rep(c("21 topic areas", "163 topic complexes", "250 topics"), each = 236), corpus = "crisis news waves", value = c(thetimes$nw.rgini20r, thetimes$nw.rgini50r, thetimes$nw.rgini250r)),
  # Routine-corpus years arrive as labels; extract the 4-digit year.
  data.frame(year = str_extract(df.gini_L$year, pattern = "[:digit:]{4,4}"), indicator = "gini", categories = df.gini_L$name, corpus = "routine", value = df.gini_L$value)
)


# Order the corpora for plotting (routine -> labelling -> waves).
topic_diversity_data$corp <- factor(topic_diversity_data$corpus, ordered = TRUE, levels = c("routine", "crisis labelling", "crisis news waves"))

# Harmonise the corpus-specific category spellings (active20/nw20d/
# "topic areas(20)"/topic20, etc.) into the three topic resolutions used
# for faceting, ordered fine to coarse.
topic_diversity_data$cat <- factor(Recode(topic_diversity_data$categories, "'active20'='21 topic areas';'active50'='163 topic complexes';'active250'='250 topics';'nw20d'='21 topic areas';'nw50d'='163 topic complexes';'nw250d'='250 topics';'topic areas(20)'='21 topic areas';'topic complexes(50)'='163 topic complexes';'topic(250)'='250 topics';'topic20'='21 topic areas';'topic50'='163 topic complexes';'topic250'='250 topics'"), ordered = TRUE, levels = c("250 topics", "163 topic complexes", "21 topic areas"))

# Drop rows without a value (including any NA placeholder rows).
top_div_dat <- subset(topic_diversity_data, !is.na(value))

# Quick visual checks at the coarse and medium resolutions before the
# combined faceted figure.
# NOTE(review): as.Date(year, format = "%Y") fills the missing month/day
# from the current date — fine for yearly trends, but confirm this is
# acceptable for the x axis.
ggplot(subset(top_div_dat, cat == "21 topic areas"), aes(y = value, x = as.Date(year, format = "%Y"), linetype = corpus, shape = corpus, color = indicator)) +
  geom_point() +
  geom_smooth()

ggplot(subset(top_div_dat, cat == "163 topic complexes"), aes(y = value, x = as.Date(year, format = "%Y"), linetype = corpus, shape = corpus, color = indicator)) +
  geom_point() +
  geom_smooth()

# Combined figure: diversity over time, faceted by topic resolution (rows)
# and indicator (columns), one series per corpus.
gg_topic_diversity <- ggplot(top_div_dat, aes(y = value, x = as.Date(year, format = "%Y"), linetype = corp, shape = corp, color = corp)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  theme_bluewhite() +
  scale_fill_viridis_d(option = "inferno", begin = 0, end = 0.85) +
  scale_color_viridis_d(option = "inferno", begin = 0, end = 0.85) +
  facet_grid(cat ~ indicator) +
  xlab("Year") +
  # One axis title covering both facet columns (active share / 1 - Gini).
  ylab("Share of active topics Diversity (1-Gini)") +
  theme(legend.position = "bottom")

ggsave(gg_topic_diversity, file = "gg_topic_diversity.svg", scale = 1.25, dpi = 1200, unit = "cm", width = 16, height = 16)






# --- Yearly explanatory indicators on `thetimes` ----------------------------

# Availability of social statistics, media-reach series, and press
# autonomy, matched in by year from their source tables.
thetimes$statistics <- social_statistics_timeline[match(thetimes$year, social_statistics_timeline$year), "indicators"]

thetimes$newspaper_circulation <- media_indicators[match(thetimes$year, media_indicators$year), "newspaper_circulation_i"]
thetimes$TV_households_share <- media_indicators[match(thetimes$year, media_indicators$year), "TV_households_share_i"]
thetimes$Internet_penetration <- media_indicators[match(thetimes$year, media_indicators$year), "Internet_penetration_i"]

thetimes$media_autonomy <- df_bes_year[match(thetimes$year, df_bes_year$year), "no_party_id"]

# Last-observation-carried-forward interpolation of the sparse series.
thetimes$media_autonomy_i <- na_locf(thetimes$media_autonomy)

# Residuals of an ARIMA(2,1,2) fit: strips trend/autocorrelation so the
# models below relate innovations rather than levels.  Factored into a
# helper instead of repeating the call ten times; uses the full
# `$residuals` element name (the original's `$resid` relied on R's partial
# matching of list names).
arima212_resid <- function(x) arima(x, order = c(2, 1, 2))$residuals

thetimes$Newspaper_212 <- arima212_resid(thetimes$newspaper_circulation)
thetimes$TV_212 <- arima212_resid(thetimes$TV_households_share)
thetimes$Internet_212 <- arima212_resid(thetimes$Internet_penetration)
thetimes$ORG_212 <- arima212_resid(thetimes$ccORG_per_article)
thetimes$PERSON_212 <- arima212_resid(thetimes$ccPERSON_per_article)
thetimes$SpendingShare_212 <- arima212_resid(thetimes$spending_GDP)
thetimes$SpendingDiversity_212 <- arima212_resid(thetimes$gini_rev)
thetimes$Autonomy_212 <- arima212_resid(thetimes$media_autonomy_i)
thetimes$Statistics_212 <- arima212_resid(thetimes$statistics)
thetimes$CLSHARE_212 <- arima212_resid(thetimes$cl.share)

# Media penetration: reach of whichever medium dominates in a given year
# (row-wise maximum of newspapers, TV, internet).
thetimes$Penetration <- rowMaxs(as.matrix(thetimes[, c("newspaper_circulation", "TV_households_share", "Internet_penetration")]))
thetimes$Penetration_212 <- arima212_resid(thetimes$Penetration)

# Binary autonomy flag.  NOTE(review): the 0.06 cutoff is not derived here
# — confirm its source.
thetimes$Autonomy_bi <- 1 * (thetimes$media_autonomy_i > 0.06)


# Nested OLS models for detrended crisis-labelling salience (CLSHARE_212),
# listed from the null model up to the fullest specification.  Formulas are
# unchanged from the original; only the redundant outer parentheses around
# the lm() calls are dropped and the assignments are ordered m0 -> m6.

# Null model (intercept only): baseline for model comparison.
CLS_m0 <- lm(CLSHARE_212 ~ 1, data = thetimes)

# State activity only: spending level and spending diversity.
CLS_m1 <- lm(CLSHARE_212 ~ SpendingShare_212 + SpendingDiversity_212, data = thetimes)

# Media reach x autonomy interaction only.
CLS_m2 <- lm(CLSHARE_212 ~ (Penetration_212) * Autonomy_212, data = thetimes)

# Named-entity densities only (organisations / persons per article).
CLS_m2b <- lm(CLSHARE_212 ~ ORG_212 + PERSON_212, data = thetimes)

# Statistics availability only.
CLS_m2c <- lm(CLSHARE_212 ~ Statistics_212, data = thetimes)

# Interaction plus state activity.
CLS_m3 <- lm(CLSHARE_212 ~ Penetration_212 * Autonomy_212 + SpendingShare_212 + SpendingDiversity_212, data = thetimes)

# ... plus statistics availability.
CLS_m4 <- lm(CLSHARE_212 ~ Penetration_212 * Autonomy_212 + SpendingShare_212 + SpendingDiversity_212 + Statistics_212, data = thetimes)

# ... plus named-entity densities.
CLS_m5 <- lm(CLSHARE_212 ~ Penetration_212 * Autonomy_212 + ORG_212 + PERSON_212 + SpendingShare_212 + SpendingDiversity_212 + Statistics_212, data = thetimes)

# Variant of m5 without the penetration x autonomy block.
CLS_m5b <- lm(CLSHARE_212 ~ +ORG_212 + PERSON_212 + SpendingShare_212 + SpendingDiversity_212 + Statistics_212, data = thetimes)

# Fullest specification, adding the internet series on its own.
CLS_m6 <- lm(CLSHARE_212 ~ Penetration_212 * Autonomy_212 + ORG_212 + PERSON_212 + SpendingShare_212 + SpendingDiversity_212 + Statistics_212 + Internet_212, data = thetimes)

# LOCF-interpolated spending-diversity series (gini_rev has gaps).
thetimes$gini_rev_i <- na_locf(thetimes$gini_rev)

# ARIMA(2,1,2) regressions with drift and external regressors: once on the
# raw indicator levels, once on the pre-whitened _212 residual series.
arima_CL1 <- Arima(thetimes$cl.share, xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev")]), order = c(2, 1, 2), include.drift = TRUE)

arima_CL2 <- Arima(thetimes$cl.share, xreg = as.matrix(thetimes[, c("Penetration_212", "Autonomy_212", "Statistics_212", "ORG_212", "PERSON_212", "SpendingShare_212", "SpendingDiversity_212")]), order = c(2, 1, 2), include.drift = TRUE)

# OLS models of crisis-labelling salience with identical predictor sets;
# what varies is the training window (used for the out-of-sample
# predictions below).

# Full period, with a linear year trend.
lm_CL1 <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + ccORG_per_article + ccPERSON_per_article + spending_GDP + gini_rev_i + year, data = thetimes)

# Trained on pre-2000 years only.
lm_CL2 <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + ccORG_per_article + ccPERSON_per_article + spending_GDP + gini_rev_i, data = subset(thetimes, year < 2000))

# Trained on post-1900 years only.
lm_CL3 <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + ccORG_per_article + ccPERSON_per_article + spending_GDP + gini_rev_i, data = subset(thetimes, year > 1900))

# Trained on pre-1900 years only.
lm_CL4 <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + ccORG_per_article + ccPERSON_per_article + spending_GDP + gini_rev_i, data = subset(thetimes, year < 1900))

# Trained on pre-1950 years only.
lm_CL5 <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + ccORG_per_article + ccPERSON_per_article + spending_GDP + gini_rev_i, data = subset(thetimes, year < 1950))

# Trained on pre-1975 years only.
lm_CL6 <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + ccORG_per_article + ccPERSON_per_article + spending_GDP + gini_rev_i, data = subset(thetimes, year < 1975))

# Trained on pre-1990 years only.
lm_CL7 <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + ccORG_per_article + ccPERSON_per_article + spending_GDP + gini_rev_i, data = subset(thetimes, year < 1990))

# Trained on the full period (no year trend).
lm_CL8 <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + ccORG_per_article + ccPERSON_per_article + spending_GDP + gini_rev_i, data = thetimes)

# Out-of-sample predictions over the FULL period from each training-window
# model.  (Original comments for pred_4..pred_7 wrongly repeated the
# pre-1900 description; corrected to match the models above.)

# Prediction trained on the pre-2000 data (forecast for 2000-2020)
thetimes$pred_1 <- stats::predict(lm_CL2, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])

# Prediction trained on the post-1900 data (forecast for 1785-1900)
thetimes$pred_2 <- stats::predict(lm_CL3, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])

# Prediction trained on the pre-1900 data (forecast for 1900-2020)
thetimes$pred_3 <- stats::predict(lm_CL4, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])

# Prediction trained on the pre-1950 data (forecast for 1950-2020)
thetimes$pred_4 <- stats::predict(lm_CL5, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])

# Prediction trained on the pre-1975 data (forecast for 1975-2020)
thetimes$pred_5 <- stats::predict(lm_CL6, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])

# Prediction trained on the pre-1990 data (forecast for 1990-2020)
thetimes$pred_6 <- stats::predict(lm_CL7, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])

# Prediction trained on the full data (in-sample fit, no forecast window)
thetimes$pred_7 <- stats::predict(lm_CL8, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])



# Quick visual checks: observed salience (black squares) vs the first three
# predictions (red diamonds).
ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_1))

ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_2))

ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_3))






# Same training-window models as lm_CL1..lm_CL8 but WITHOUT the
# named-entity predictors (ccORG/ccPERSON): the "X" specification.

# Full period, with a linear year trend.
lm_CL1X <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i + year, data = thetimes)

# Trained on pre-2000 years only.
lm_CL2X <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i, data = subset(thetimes, year < 2000))

# Trained on post-1900 years only.
lm_CL3X <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i, data = subset(thetimes, year > 1900))

# Trained on pre-1900 years only.
lm_CL4X <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i, data = subset(thetimes, year < 1900))

# Trained on pre-1950 years only.
lm_CL5X <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i, data = subset(thetimes, year < 1950))

# Trained on pre-1975 years only.
lm_CL6X <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i, data = subset(thetimes, year < 1975))

# Trained on pre-1990 years only.
lm_CL7X <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i, data = subset(thetimes, year < 1990))

# Trained on the full period (no year trend).
lm_CL8X <- lm(cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i, data = thetimes)


# Out-of-sample predictions over the full period from each without-ORG
# training-window model.

# Prediction trained on the pre-2000 data (forecast for 2000-2020)
thetimes$pred_1X <- stats::predict(lm_CL2X, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])

# Prediction trained on the post-1900 data (forecast for 1785-1900)
thetimes$pred_2X <- stats::predict(lm_CL3X, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])

# Prediction trained on the pre-1900 data (forecast for 1900-2020)
thetimes$pred_3X <- stats::predict(lm_CL4X, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])

# Prediction trained on the pre-1950 data (forecast for 1950-2020)
thetimes$pred_4X <- stats::predict(lm_CL5X, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])

# Prediction trained on the pre-1975 data (forecast for 1975-2020)
thetimes$pred_5X <- stats::predict(lm_CL6X, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])

# Prediction trained on the pre-1990 data (forecast for 1990-2020)
thetimes$pred_6X <- stats::predict(lm_CL7X, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])

# Prediction trained on the full data (no forecast)
thetimes$pred_7X <- stats::predict(lm_CL8X, newdata = thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])


# Quick visual checks of the without-ORG predictions (red diamonds) against
# observed salience (black squares).
ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_1X))

ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_2X))

ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_3X))

ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_4X))

ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_5X))

# Observed (black squares) vs predicted (red diamonds) salience; shaded
# strips along the x axis mark the training window (red) and the held-out
# forecast window (blue).

# Trained 1785-1990, forecast 1990-2020 (pred_6X).
gg_1785_1990 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_6X)) +
  annotate("rect", xmin = 1990, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 1990, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

# Trained 1785-2000, forecast 2000-2020 (pred_1X).  Bug fix: the original
# chained assignment (`gg_1785_2000 <- gg_1785_1975 <- ggplot(...)`) also
# wrote this plot into gg_1785_1975, a leftover from copy-paste; the stray
# second target is removed.
gg_1785_2000 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_1X)) +
  annotate("rect", xmin = 2000, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 2000, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

# Remaining observed-vs-predicted panels for the with-ORG models.  Red
# strip = training window, blue strip = forecast window.  The 1785-1975
# panel uses ymax 0.20 because pred_5X spans a wider value range than the
# other predictions.

# Trained 1900-2020, backcast 1785-1900 (pred_2X).
gg_1900_2020 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_2X)) +
  annotate("rect", xmin = 1785, xmax = 1900, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1900, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

# Trained 1785-1900, forecast 1900-2020 (pred_3X).
gg_1785_1900 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_3X)) +
  annotate("rect", xmin = 1900, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 1900, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

# Trained 1785-1950, forecast 1950-2020 (pred_4X).
gg_1785_1950 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_4X)) +
  annotate("rect", xmin = 1950, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 1950, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

# Trained 1785-1975, forecast 1975-2020 (pred_5X).
gg_1785_1975 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_5X)) +
  annotate("rect", xmin = 1975, xmax = 2020, ymin = 0, ymax = 0.20, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 1975, ymin = 0, ymax = 0.20, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

# Full-sample fit, no forecast window (pred_7X).
gg_1785_2020 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_7X)) +
  annotate("rect", xmin = 1785, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")


# Observed-vs-predicted panels for the WITHOUT-ORG models (ggX_*),
# mirroring the gg_* panels above.  Red strip = training window, blue
# strip = forecast window.

ggX_1785_1990 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_6X)) +
  annotate("rect", xmin = 1990, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 1990, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

# Bug fix: the original read `ggX_1785_2000 <- gg_1785_1975 <- ggplot(...)`,
# which silently OVERWROTE the previously defined gg_1785_1975 panel with
# this without-ORG variant, so the later grid.arrange() displayed the wrong
# plot.  The stray second assignment target is removed.
ggX_1785_2000 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_1X)) +
  annotate("rect", xmin = 2000, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 2000, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

ggX_1900_2020 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_2X)) +
  annotate("rect", xmin = 1785, xmax = 1900, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1900, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

ggX_1785_1900 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_3X)) +
  annotate("rect", xmin = 1900, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 1900, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

ggX_1785_1950 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_4X)) +
  annotate("rect", xmin = 1950, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 1950, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

# Wider strips (ymax 0.20) because pred_5X spans a larger value range.
ggX_1785_1975 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_5X)) +
  annotate("rect", xmin = 1975, xmax = 2020, ymin = 0, ymax = 0.20, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 1975, ymin = 0, ymax = 0.20, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

ggX_1785_2020 <- ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_7X)) +
  annotate("rect", xmin = 1785, xmax = 2020, ymin = 0, ymax = 0.01, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

# Long format: one row per year x prediction series, for the faceted
# calibration grid.
prediction_long <- pivot_longer(thetimes, col = c("pred_1", "pred_2", "pred_3", "pred_4", "pred_5", "pred_6", "pred_7", "pred_1X", "pred_2X", "pred_3X", "pred_4X", "pred_5X", "pred_6X", "pred_7X"))

# Training-window start/end per prediction series.
prediction_long$train_start <- car::Recode(prediction_long$name, "'pred_1'=1785;'pred_1X'=1785;'pred_2'=1900;'pred_2X'=1900;'pred_3'=1785;'pred_3X'=1785;'pred_4'=1785;'pred_4X'=1785;'pred_5'=1785;'pred_5X'=1785;'pred_6'=1785;'pred_6X'=1785;'pred_7'=1785;'pred_7X'=1785")

prediction_long$train_end <- car::Recode(prediction_long$name, "'pred_1'=2000;'pred_1X'=2000;'pred_2'=2020;'pred_2X'=2020;'pred_3'=1900;'pred_3X'=1900;'pred_4'=1950;'pred_4X'=1950;'pred_5'=1975;'pred_5X'=1975;'pred_6'=1990;'pred_6X'=1990;'pred_7'=2020;'pred_7X'=2020")

# Keep the end value on a single row (year 2020) and NA elsewhere, so the
# aes()-mapped geom_rect() below draws ONE rectangle per facet instead of
# one per data row (which would stack the alpha).
prediction_long$train_end <- ifelse(prediction_long$year == 2020, prediction_long$train_end, NA)

# Forecast-window start/end per prediction series.
prediction_long$predict_start <- car::Recode(prediction_long$name, "'pred_1'=2000;'pred_1X'=2000;'pred_2'=1785;'pred_2X'=1785;'pred_3'=1900;'pred_3X'=1900;'pred_4'=1950;'pred_4X'=1950;'pred_5'=1975;'pred_5X'=1975;'pred_6'=1990;'pred_6X'=1990;'pred_7'=2020;'pred_7X'=2020")

prediction_long$predict_end <- car::Recode(prediction_long$name, "'pred_1'=2020;'pred_1X'=2020;'pred_2'=1900;'pred_2X'=1900;'pred_3'=2020;'pred_3X'=2020;'pred_4'=2020;'pred_4X'=2020;'pred_5'=2020;'pred_5X'=2020;'pred_6'=2020;'pred_6X'=2020;'pred_7'=2020;'pred_7X'=2020")

prediction_long$predict_end <- ifelse(prediction_long$year == 2020, prediction_long$predict_end, NA)

# Facet labels: training window and predictor set (with vs without the
# ORG/PERSON named-entity predictors).
prediction_long$train <- car::Recode(prediction_long$name, "'pred_1'='1785-2000';'pred_1X'='1785-2000';'pred_2'='1900-2020';'pred_2X'='1900-2020';'pred_3'='1785-1900';'pred_3X'='1785-1900';'pred_4'='1785-1950';'pred_4X'='1785-1950';'pred_5'='1785-1975';'pred_5X'='1785-1975';'pred_6'='1785-1990';'pred_6X'='1785-1990';'pred_7'='1785-2020';'pred_7X'='1785-2020'")

prediction_long$set <- car::Recode(prediction_long$name, "'pred_1'='with_ORG';'pred_1X'='without_ORG';'pred_2'='with_ORG';'pred_2X'='without_ORG';'pred_3'='with_ORG';'pred_3X'='without_ORG';'pred_4'='with_ORG';'pred_4X'='without_ORG';'pred_5'='with_ORG';'pred_5X'='without_ORG';'pred_6'='with_ORG';'pred_6X'='without_ORG';'pred_7'='with_ORG';'pred_7X'='without_ORG'")

# Height of the shaded windows; pred_5/5X need a larger range (0.2).
prediction_long$range <- car::Recode(prediction_long$name, "'pred_1'=0.01;'pred_1X'=0.01;'pred_2'=0.01;'pred_2X'=0.01;'pred_3'=0.01;'pred_3X'=0.01;'pred_4'=0.01;'pred_4X'=0.01;'pred_5'=0.2;'pred_5X'=0.2;'pred_6'=0.01;'pred_6X'=0.01;'pred_7'=0.01;'pred_7X'=0.01")

# Single-panel check of the pre-1975 model (pred_5X) with training (red)
# and forecast (blue) windows shaded.
ggplot(thetimes, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = pred_5X)) +
  annotate("rect", xmin = 1975, xmax = 2020, ymin = 0, ymax = 0.20, fill = "darkblue", alpha = .25) +
  annotate("rect", xmin = 1785, xmax = 1975, ymin = 0, ymax = 0.20, fill = "darkred", alpha = .25) +
  theme_soft() +
  ylab("Crisis Labelling Salience")

# Exploratory single-series version of the calibration grid.  Because the
# aes()-mapped geom_rect() draws one rectangle per data row here, alpha is
# divided by the row count (236 years) to keep the effective shading
# comparable.
ggplot(subset(prediction_long, name == "pred_1"), aes(x = year)) +
  geom_rect(aes(xmin = train_start, xmax = train_end), ymin = 0, ymax = 0.025, fill = "darkblue", alpha = .25 / 236) +
  geom_rect(aes(xmin = predict_start, xmax = predict_end), ymin = 0, ymax = 0.025, fill = "darkred", alpha = .25 / 236) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = value)) +
  facet_grid(train ~ set, scale = "free_y") +
  theme_soft() +
  ylab("Crisis Labelling Salience")

# Calibration grid: observed (black squares) vs predicted (red diamonds)
# salience, faceted by training window (rows) and predictor set (columns).
# The original assigned gg_calibration twice (first with alpha .2, then
# .25) and only the second version was ever saved; the dead first
# definition is removed.
gg_calibration <- ggplot(prediction_long, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = value)) +
  geom_rect(aes(xmin = train_start, xmax = train_end, ymax = range), ymin = 0, fill = "darkblue", alpha = .25) +
  geom_rect(aes(xmin = predict_start, xmax = predict_end, ymax = range), ymin = 0, fill = "darkred", alpha = .25) +
  facet_grid(train ~ set, scale = "free_y") +
  theme_soft() +
  ylab("Crisis Labelling Salience")

ggsave(file = "gg_calibration.svg", device = "svg", gg_calibration, unit = "cm", width = 12, height = 16, scale = 1.25, dpi = 1200)

# Text regression tables across training windows, with and without the
# named-entity predictors.
stargazer(lm_CL1, lm_CL2, lm_CL7, lm_CL6, lm_CL5, lm_CL4, lm_CL3, type = "text", column.labels = c("1785-2020", "1785-2000", "1785-1990", "1785-1975", "1785-1950", "1785-1900", "1900-2020"))
stargazer(lm_CL1X, lm_CL2X, lm_CL7X, lm_CL6X, lm_CL5X, lm_CL4X, lm_CL3X, type = "text", column.labels = c("1785-2020", "1785-2000", "1785-1990", "1785-1975", "1785-1950", "1785-1900", "1900-2020"))

# Out-of-sample fit: squared Pearson correlation (R^2) between prediction
# and observed salience, restricted to each model's held-out window.
with(subset(thetimes, year >= 2000), cor.test(pred_1, cl.share))$estimate^2
with(subset(thetimes, year >= 2000), cor.test(pred_1X, cl.share))$estimate^2

with(subset(thetimes, year <= 1900), cor.test(pred_2, cl.share))$estimate^2
with(subset(thetimes, year <= 1900), cor.test(pred_2X, cl.share))$estimate^2

with(subset(thetimes, year >= 1900), cor.test(pred_3, cl.share))$estimate^2
with(subset(thetimes, year >= 1900), cor.test(pred_3X, cl.share))$estimate^2

with(subset(thetimes, year >= 1950), cor.test(pred_4, cl.share))$estimate^2
with(subset(thetimes, year >= 1950), cor.test(pred_4X, cl.share))$estimate^2

with(subset(thetimes, year >= 1975), cor.test(pred_5, cl.share))$estimate^2
with(subset(thetimes, year >= 1975), cor.test(pred_5X, cl.share))$estimate^2

with(subset(thetimes, year >= 1990), cor.test(pred_6, cl.share))$estimate^2
with(subset(thetimes, year >= 1990), cor.test(pred_6X, cl.share))$estimate^2

# Full-sample models have no held-out window: show the full cor.test output.
with(thetimes, cor.test(pred_7, cl.share))
with(thetimes, cor.test(pred_7X, cl.share))




grid.arrange(gg_1900_2020, gg_1785_1900, gg_1785_1950, gg_1785_1975, gg_1785_1990, gg_1785_2000, gg_1785_2020, ggX_1900_2020, ggX_1785_1900, ggX_1785_1950, ggX_1785_1975, ggX_1785_1990, ggX_1785_2000, ggX_1785_2020, ncol = 2, as.table = FALSE)



library(stargazer)

stargazer(CLS_m0, CLS_m1, CLS_m2b, CLS_m2, CLS_m3, CLS_m4, CLS_m5, CLS_m6, type = "text")



summary(lm(cl.share ~ newspaper_circulation + TV_households_share + Internet_penetration + statistics + (ccPERSON_index) + (ccORG_index) + spending_GDP + gini_rev + year, data = thetimes))



# Regression-with-ARIMA-errors fits of crisis-labelling salience:
# ARIMA(2,1,2) with drift, full predictor set, estimated over different
# year windows. A small helper keeps the repeated Arima() call in one place.
fit_cl_arima_full <- function(dat, xvars) {
  Arima(
    dat[, "cl.share"],
    xreg = as.matrix(dat[, xvars]),
    order = c(2, 1, 2),
    include.drift = TRUE
  )
}

# Predictor sets: media autonomy is included only for windows reaching into
# the twentieth century, exactly as in the original per-model calls.
cl_xv_full <- c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")
cl_xv_no_autonomy <- c("Penetration", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")

arima_CL.1900_2020 <- fit_cl_arima_full(subset(thetimes, year > 1900), cl_xv_full)
arima_CL.1785_1900 <- fit_cl_arima_full(subset(thetimes, year < 1900), cl_xv_no_autonomy)
arima_CL.1785_1950 <- fit_cl_arima_full(subset(thetimes, year < 1950), cl_xv_no_autonomy)
arima_CL.1785_1975 <- fit_cl_arima_full(subset(thetimes, year < 1975), cl_xv_full)
arima_CL.1785_1990 <- fit_cl_arima_full(subset(thetimes, year < 1990), cl_xv_full)
arima_CL.1785_2000 <- fit_cl_arima_full(subset(thetimes, year < 2000), cl_xv_full)
arima_CL.1785_2020 <- fit_cl_arima_full(subset(thetimes, year < 2021), cl_xv_full)

# Standard-normal noise regressors used by the placebo ("e_") ARIMA fits.
# NOTE(review): no set.seed() precedes these draws, so placebo results are
# not reproducible across runs — confirm whether a seed is set upstream.
dummat <- matrix(rnorm(n = 236, mean = 0, sd = 1))
colnames(dummat) <- "dummy"

# Two noise columns on the main data frame (236 rows = years 1785-2020).
thetimes$dummy <- rnorm(n = 236, mean = 0, sd = 1)
thetimes$dummy2 <- rnorm(n = 236, mean = 0, sd = 1)

# Placebo ("e_") fits: identical ARIMA(2,1,2)+drift specification, but with
# the pure-noise regressors (dummy, dummy2) as xreg — a benchmark against
# which the substantive predictor sets are compared.
fit_cl_arima_noise <- function(dat) {
  Arima(
    dat[, "cl.share"],
    xreg = as.matrix(dat[, c("dummy", "dummy2")]),
    order = c(2, 1, 2),
    include.drift = TRUE
  )
}

e_arima_CL.1900_2020 <- fit_cl_arima_noise(subset(thetimes, year > 1900))
e_arima_CL.1785_1900 <- fit_cl_arima_noise(subset(thetimes, year < 1900))
e_arima_CL.1785_1950 <- fit_cl_arima_noise(subset(thetimes, year < 1950))
e_arima_CL.1785_1975 <- fit_cl_arima_noise(subset(thetimes, year < 1975))
e_arima_CL.1785_1990 <- fit_cl_arima_noise(subset(thetimes, year < 1990))
e_arima_CL.1785_2000 <- fit_cl_arima_noise(subset(thetimes, year < 2000))
e_arima_CL.1785_2020 <- fit_cl_arima_noise(subset(thetimes, year < 2021))

# Pruned ("p_") fits: same ARIMA(2,1,2)+drift specification, but WITHOUT the
# actor-count predictors (ccORG_per_article / ccPERSON_per_article).
fit_cl_arima_pruned <- function(dat, xvars) {
  Arima(
    dat[, "cl.share"],
    xreg = as.matrix(dat[, xvars]),
    order = c(2, 1, 2),
    include.drift = TRUE
  )
}

# As above, media autonomy enters only for windows reaching past 1900.
cl_pv_full <- c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")
cl_pv_no_autonomy <- c("Penetration", "statistics", "spending_GDP", "gini_rev_i")

p_arima_CL.1900_2020 <- fit_cl_arima_pruned(subset(thetimes, year > 1900), cl_pv_full)
p_arima_CL.1785_1900 <- fit_cl_arima_pruned(subset(thetimes, year < 1900), cl_pv_no_autonomy)
p_arima_CL.1785_1950 <- fit_cl_arima_pruned(subset(thetimes, year < 1950), cl_pv_no_autonomy)
p_arima_CL.1785_1975 <- fit_cl_arima_pruned(subset(thetimes, year < 1975), cl_pv_full)
p_arima_CL.1785_1990 <- fit_cl_arima_pruned(subset(thetimes, year < 1990), cl_pv_full)
p_arima_CL.1785_2000 <- fit_cl_arima_pruned(subset(thetimes, year < 2000), cl_pv_full)
p_arima_CL.1785_2020 <- fit_cl_arima_pruned(subset(thetimes, year < 2021), cl_pv_full)






# Automatic order selection for the full-period model, as a check on the
# hand-picked ARIMA(2,1,2).
# NOTE(review): this call uses "gini_rev" where every manual Arima() fit
# above uses "gini_rev_i" — confirm which inequality series is intended.
aarima_CL.1785_2020 <- auto.arima(subset(thetimes, year < 2021)[, "cl.share"], xreg = as.matrix(subset(thetimes, year < 2021)[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev")]), include.drift = TRUE)

# Check remaining autocorrelation structure in the OLS residuals.
auto.arima(lm_CL1$resid)

# Forecasts of each windowed model over the FULL 1785-2020 covariate matrix
# (in- plus out-of-window prediction), collected as data frames.
# NOTE(review): the second positional argument of forecast() is the horizon
# `h`; passing thetimes[, "cl.share"] there looks unintended — with xreg
# supplied the horizon comes from the xreg rows, so confirm it is ignored.
df_fc_1785_1900 <- data.frame(forecast(arima_CL.1785_1900, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])))
df_fc_1785_1950 <- data.frame(forecast(arima_CL.1785_1950, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])))
df_fc_1785_1975 <- data.frame(forecast(arima_CL.1785_1975, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])))
df_fc_1785_1990 <- data.frame(forecast(arima_CL.1785_1990, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])))
df_fc_1785_2000 <- data.frame(forecast(arima_CL.1785_2000, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])))
df_fc_1785_2020 <- data.frame(forecast(arima_CL.1785_2020, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])))
df_fc_1900_2020 <- data.frame(forecast(arima_CL.1900_2020, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "ccORG_per_article", "ccPERSON_per_article", "spending_GDP", "gini_rev_i")])))

# Same, for the pruned ("p_") models (no actor-count predictors in xreg).
p.df_fc_1785_1900 <- data.frame(forecast(p_arima_CL.1785_1900, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "statistics", "spending_GDP", "gini_rev_i")])))
p.df_fc_1785_1950 <- data.frame(forecast(p_arima_CL.1785_1950, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "statistics", "spending_GDP", "gini_rev_i")])))
p.df_fc_1785_1975 <- data.frame(forecast(p_arima_CL.1785_1975, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])))
p.df_fc_1785_1990 <- data.frame(forecast(p_arima_CL.1785_1990, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])))
p.df_fc_1785_2000 <- data.frame(forecast(p_arima_CL.1785_2000, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])))
p.df_fc_1785_2020 <- data.frame(forecast(p_arima_CL.1785_2020, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])))
p.df_fc_1900_2020 <- data.frame(forecast(p_arima_CL.1900_2020, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("Penetration", "media_autonomy_i", "statistics", "spending_GDP", "gini_rev_i")])))


# Same, for the placebo ("e_") models (noise regressors only).
e.df_fc_1785_1900 <- data.frame(forecast(e_arima_CL.1785_1900, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("dummy", "dummy2")])))
e.df_fc_1785_1950 <- data.frame(forecast(e_arima_CL.1785_1950, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("dummy", "dummy2")])))
e.df_fc_1785_1975 <- data.frame(forecast(e_arima_CL.1785_1975, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("dummy", "dummy2")])))
e.df_fc_1785_1990 <- data.frame(forecast(e_arima_CL.1785_1990, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("dummy", "dummy2")])))
e.df_fc_1785_2000 <- data.frame(forecast(e_arima_CL.1785_2000, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("dummy", "dummy2")])))
e.df_fc_1785_2020 <- data.frame(forecast(e_arima_CL.1785_2020, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("dummy", "dummy2")])))
e.df_fc_1900_2020 <- data.frame(forecast(e_arima_CL.1900_2020, thetimes[, "cl.share"], xreg = as.matrix(thetimes[, c("dummy", "dummy2")])))

# Attach each model's point forecast to `thetimes`, one column per training
# window: afc.* (full predictor set), e_afc.* (noise placebo), p_afc.*
# (pruned set). Three separate loops preserve the original column order
# (all afc.* first, then all e_afc.*, then all p_afc.*).
fc_windows <- c("1785_1900", "1785_1950", "1785_1975", "1785_1990", "1785_2000", "1785_2020", "1900_2020")

for (w in fc_windows) {
  thetimes[[paste0("afc.", w)]] <- get(paste0("df_fc_", w))$Point.Forecast
}

for (w in fc_windows) {
  thetimes[[paste0("e_afc.", w)]] <- get(paste0("e.df_fc_", w))$Point.Forecast
}

for (w in fc_windows) {
  thetimes[[paste0("p_afc.", w)]] <- get(paste0("p.df_fc_", w))$Point.Forecast
}

# Long format: one row per (year, model-window) for each of the three ARIMA
# variants (full, placebo "e_", pruned "p_"); window label goes into `name`,
# the point forecast into `value`.
arimafc_long <- pivot_longer(thetimes, col = c("afc.1785_1900", "afc.1785_1950", "afc.1785_1975", "afc.1785_1990", "afc.1785_2000", "afc.1785_2020", "afc.1900_2020"))
e_arimafc_long <- pivot_longer(thetimes, col = c("e_afc.1785_1900", "e_afc.1785_1950", "e_afc.1785_1975", "e_afc.1785_1990", "e_afc.1785_2000", "e_afc.1785_2020", "e_afc.1900_2020"))
p_arimafc_long <- pivot_longer(thetimes, col = c("p_afc.1785_1900", "p_afc.1785_1950", "p_afc.1785_1975", "p_afc.1785_1990", "p_afc.1785_2000", "p_afc.1785_2020", "p_afc.1900_2020"))

# Forecast fan (95% interval) for the 1785-1900 model; x = 116:351 are the
# row indices of the forecast table.
ggplot(df_fc_1785_1900, aes(y = Point.Forecast, x = 116:351, ymin = Lo.95, ymax = Hi.95)) +
  geom_ribbon(alpha = .25) +
  geom_line()

# Level-shift the backcasting 1900-2020 series.
# NOTE(review): 0.02750686 is a hard-coded offset — document its derivation
# (it appears to be a mean difference; this value_i is also recomputed below).
arimafc_long$value_i <- ifelse(arimafc_long$name == "afc.1900_2020", arimafc_long$value - 0.02750686, arimafc_long$value)

# Order the long forecast tables by model name and year, then plot and
# compute per-model calibration offsets.
# FIX(review): the ordering step originally came AFTER the plot that uses
# arimafc_longX, which errors in a fresh session; moved it first.
arimafc_longX <- arimafc_long[order(arimafc_long$name, arimafc_long$year), ]
p_arimafc_longX <- p_arimafc_long[order(p_arimafc_long$name, p_arimafc_long$year), ]
e_arimafc_longX <- e_arimafc_long[order(e_arimafc_long$name, e_arimafc_long$year), ]

# Forecast (red squares) vs observed salience (black diamonds), one panel
# per training window.
ggplot(arimafc_longX, aes(x = year)) +
  geom_point(aes(y = value), color = "red", shape = 15) +
  geom_point(aes(y = cl.share), color = "black", shape = 18) +
  facet_grid(name ~ .) +
  theme_soft()

# Mean offset (forecast minus observed) per model window, used to recentre
# each forecast series on the observed level.
pmeans <- tapply(p_arimafc_longX$value, p_arimafc_longX$name, mean, na.rm = TRUE) - tapply(p_arimafc_longX$cl.share, p_arimafc_longX$name, mean, na.rm = TRUE)
emeans <- tapply(e_arimafc_longX$value, e_arimafc_longX$name, mean, na.rm = TRUE) - tapply(e_arimafc_longX$cl.share, e_arimafc_longX$name, mean, na.rm = TRUE)
xmeans <- tapply(arimafc_longX$value, arimafc_longX$name, mean, na.rm = TRUE) - tapply(arimafc_longX$cl.share, arimafc_longX$name, mean, na.rm = TRUE)

p_arimafc_longX$avg <- as.numeric(pmeans[match(p_arimafc_longX$name, names(pmeans))])
# FIX(review): the next two lines originally indexed `pmeans`, attaching the
# pruned-model offsets to the placebo and full tables; they now use the
# matching emeans/xmeans vectors.
e_arimafc_longX$avg <- as.numeric(emeans[match(e_arimafc_longX$name, names(emeans))])
arimafc_longX$avg <- as.numeric(xmeans[match(arimafc_longX$name, names(xmeans))])

# Recentred forecast series: forecast minus its own model's mean offset.
p_arimafc_longX$value_i <- p_arimafc_longX$value - p_arimafc_longX$avg
e_arimafc_longX$value_i <- e_arimafc_longX$value - e_arimafc_longX$avg
arimafc_longX$value_i <- arimafc_longX$value - arimafc_longX$avg



# Stack the non-ARIMA prediction table (prediction_long) with the three ARIMA
# forecast tables into one long frame. Each ARIMA set contributes
# 7 training windows x 236 years = 1652 rows.
# NOTE(review): `value` takes the recentred series (value_i) for the full
# ARIMA set but the raw `value` for the pruned/placebo sets — confirm intended.
prediction_longX <- data.frame(
  value = c(prediction_long$value, arimafc_longX$value_i, p_arimafc_longX$value, e_arimafc_longX$value),
  value_i = c(prediction_long$value, arimafc_longX$value_i, p_arimafc_longX$value_i, e_arimafc_longX$value_i),
  year = c(prediction_long$year, arimafc_longX$year, p_arimafc_longX$year, e_arimafc_longX$year),
  cl.share = c(prediction_long$cl.share, arimafc_longX$cl.share, p_arimafc_longX$cl.share, e_arimafc_longX$cl.share),
  # First / last ($-anchored) 4-digit group of e.g. "afc.1785_1900" gives the
  # training start / end year.
  train_start = c(prediction_long$train_start, as.numeric(str_extract(arimafc_longX$name, "[:digit:]{4,4}")), as.numeric(str_extract(p_arimafc_longX$name, "[:digit:]{4,4}")), as.numeric(str_extract(e_arimafc_longX$name, "[:digit:]{4,4}"))),
  train_end = c(prediction_long$train_end, as.numeric(str_extract(arimafc_longX$name, "[:digit:]{4,4}$")), as.numeric(str_extract(p_arimafc_longX$name, "[:digit:]{4,4}$")), as.numeric(str_extract(e_arimafc_longX$name, "[:digit:]{4,4}$")),
  predict_start = c(prediction_long$predict_start, as.numeric(str_extract(arimafc_longX$name, "[:digit:]{4,4}$")), as.numeric(str_extract(p_arimafc_longX$name, "[:digit:]{4,4}$")), as.numeric(str_extract(e_arimafc_longX$name, "[:digit:]{4,4}$"))),
  # Per ARIMA set: the six forward windows (1180 + 236 rows) predict to 2020,
  # the 1900-2020 window (236 rows) backcasts to 1900.
  predict_end = c(prediction_long$predict_end, rep(c(2020), times = c(1180 + 236)), rep(c(1900), times = c(236)), rep(c(2020), times = c(1180 + 236)), rep(c(1900), times = c(236)), rep(c(2020), times = c(1180 + 236)), rep(c(1900), times = c(236))),
  set = c(prediction_long$set, rep("ARIMA_with_ORG", times = 7 * 236), rep("ARIMA_without_ORG", times = 7 * 236), rep("ARIMA_without_predictors", times = 7 * 236)),
  range = c(prediction_long$range, rep(0.01, times = 7 * 236), rep(0.01, times = 7 * 236), rep(0.01, times = 7 * 236)),
  train = c(prediction_long$train, rep(c("1785-1900", "1785-1950", "1785-1975", "1785-1990", "1785-2000", "1785-2020", "1900-2020"), each = 236), rep(c("1785-1900", "1785-1950", "1785-1975", "1785-1990", "1785-2000", "1785-2020", "1900-2020"), each = 236), rep(c("1785-1900", "1785-1950", "1785-1975", "1785-1990", "1785-2000", "1785-2020", "1900-2020"), each = 236))
)

prediction_longX <- prediction_longX[order(prediction_longX$set, prediction_longX$train, prediction_longX$year), ]

# Keep the window markers on a single row per panel (the year-2020 row) so
# each shaded rectangle is drawn exactly once; fix the rectangle height.
prediction_longX$predict_end <- ifelse(prediction_longX$year == 2020, prediction_longX$predict_end, NA)
prediction_longX$train_end <- ifelse(prediction_longX$year == 2020, prediction_longX$train_end, NA)
prediction_longX$range <- 0.01

# The 1900-2020 model predicts backwards: its prediction window is 1785-1900.
prediction_longX$predict_start <- ifelse(prediction_longX$set == "ARIMA_with_ORG" & prediction_longX$train == "1900-2020", 1785, prediction_longX$predict_start)
prediction_longX$predict_end <- ifelse(prediction_longX$set == "ARIMA_with_ORG" & prediction_longX$train == "1900-2020", 1900, prediction_longX$predict_end)

prediction_longX$predict_end <- ifelse(prediction_longX$year == 2020, prediction_longX$predict_end, NA)
prediction_longX$train_end <- ifelse(prediction_longX$year == 2020, prediction_longX$train_end, NA)

# Recentre each set x train panel: subtract its median prediction and add
# the grand mean of observed salience, giving the comparable series value_x.
agg_avg <- aggregate(prediction_longX$value, by = list(prediction_longX$set, prediction_longX$train), median, na.rm = TRUE)
agg_avg$idx <- interaction(agg_avg$Group.1, agg_avg$Group.2)
prediction_longX$idx <- interaction(prediction_longX$set, prediction_longX$train)

prediction_longX$value_avg <- agg_avg[match(prediction_longX$idx, agg_avg$idx), "x"]

prediction_longX$value_x <- prediction_longX$value - prediction_longX$value_avg + mean(prediction_longX$cl.share, na.rm = TRUE)

# Letter-prefixed ordered labels so the facet rows sort chronologically.
prediction_longX$TRAIN <- factor(car::Recode(prediction_longX$train, "'1785-1900'='A) 1785-1900';'1785-1950'='B) 1785-1950';'1785-1975'='C) 1785-1975';'1785-1990'='D) 1785-1990';'1785-2000'='E) 1785-2000';'1900-2020'='F) 1900-2020';'1785-2020'='X) 1785-2020'"), ordered = TRUE, levels = c("A) 1785-1900", "B) 1785-1950", "C) 1785-1975", "D) 1785-1990", "E) 1785-2000", "F) 1900-2020", "X) 1785-2020"))

# Ordered labels for the five model/predictor-set columns.
prediction_longX$SET <- factor(
  car::Recode(
    prediction_longX$set,
    "   'ARIMA_with_ORG'            ='T1) ARIMA_T-MG-O';
        'ARIMA_without_ORG'         ='T2) ARIMA_T-MG-.';
        'ARIMA_without_predictors'  ='T3) ARIMA_T-..-.';
        'with_ORG'                  ='T4) Non-ARIMA_.-MG-O';
        'without_ORG'               ='T5) Non-ARIMA_.-MG-.'"
  ),
  ordered = TRUE, levels = c(
    "T1) ARIMA_T-MG-O",
    "T2) ARIMA_T-MG-.",
    "T3) ARIMA_T-..-.",
    "T4) Non-ARIMA_.-MG-O",
    "T5) Non-ARIMA_.-MG-."
  )
)

# Hard-set the rectangle bounds for the reversed (1900-2020) training window.
prediction_longX[with(prediction_longX, (!is.na(train_end) & TRAIN == "F) 1900-2020")), "train_end"] <- 2020
prediction_longX[with(prediction_longX, (!is.na(train_start) & TRAIN == "F) 1900-2020")), "train_start"] <- 1900
prediction_longX[with(prediction_longX, (!is.na(predict_end) & TRAIN == "F) 1900-2020")), "predict_end"] <- 1900
prediction_longX[with(prediction_longX, (!is.na(predict_start) & TRAIN == "F) 1900-2020")), "predict_start"] <- 1785

# Full calibration figure on a common y scale.
# NOTE(review): ylim() replaces the scale added by scale_y_continuous() and
# drops any points outside [-0.025, 0.1] — confirm this is intended.
gg_calibrationX <- ggplot(prediction_longX, aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = value_x)) +
  facet_grid(TRAIN ~ SET, scale = "fixed") +
  theme_soft() +
  ylab("Crisis Labelling Salience") +
  scale_y_continuous() +
  geom_rect(aes(xmin = train_start, xmax = train_end, ymax = range * 10), ymin = 0.01, fill = "darkblue", alpha = .25) +
  geom_rect(aes(xmin = predict_start, xmax = predict_end, ymax = range * 10), ymin = 0.01, fill = "darkred", alpha = .25) +
  ylim(-0.025, 0.1)

# Quick look at the full-ARIMA set only (free y scales, raw value).
ggplot(subset(prediction_longX, set == "ARIMA_with_ORG"), aes(x = year)) +
  geom_point(shape = 15, aes(y = cl.share)) +
  geom_point(color = "red", shape = 18, aes(y = value)) +
  geom_rect(aes(xmin = train_start, xmax = train_end, ymax = range), ymin = 0, fill = "darkblue", alpha = .25) +
  geom_rect(aes(xmin = predict_start, xmax = predict_end, ymax = range), ymin = 0, fill = "darkred", alpha = .25) +
  facet_grid(train ~ set, scale = "free_y") +
  theme_soft() +
  ylab("Crisis Labelling Salience")

ggsave(file = "gg_calibrationX.svg", device = "svg", gg_calibrationX, unit = "cm", width = 12, height = 16, scale = 2.00, dpi = 1200)

# Out-of-sample RMSE of the recentred predictions (value_x) against observed
# cl.share, for every predictor set (columns T1-T5) and training window
# (rows). The evaluation years are each window's hold-out period: the full
# range for 1785-2020, and the pre-1900 backcast for the 1900-2020 window.
# Values, column names, and row order are identical to the original
# column-by-column construction.
rmse_sets <- c(
  T1 = "ARIMA_with_ORG",
  T2 = "ARIMA_without_ORG",
  T3 = "ARIMA_without_predictors",
  T4 = "with_ORG",
  T5 = "without_ORG"
)
rmse_trains <- c("1785-1900", "1785-1950", "1785-1975", "1785-1990", "1785-2000", "1785-2020", "1900-2020")
rmse_eval_years <- list(1900:2020, 1950:2020, 1975:2020, 1990:2020, 2000:2020, 1785:2020, 1785:1900)

rmse_predictions <- data.frame(lapply(rmse_sets, function(set_name) {
  mapply(function(train_name, eval_years) {
    d <- subset(prediction_longX, set == set_name & train == train_name & year %in% eval_years)
    sqrt(mean(abs(d$cl.share - d$value_x)^2))
  }, rmse_trains, rmse_eval_years, USE.NAMES = FALSE)
}))

# Glob-pattern dictionaries (quanteda) for the individual crisis-vocabulary
# stems; each dictionary maps matching tokens to its single key.
dict_crisis <- dictionary(list(crisis = c("crisis.*", "crises.*")))
dict_disaster <- dictionary(list(disaster = c("disaster.*")))
dict_collapse <- dictionary(list(collapse = c("collaps.*")))
dict_recession <- dictionary(list(recession = c("recession.*")))
dict_emergency <- dictionary(list(emergency = c("emergency.*", "emergencies.*")))
dict_catastrophe <- dictionary(list(catastroph = c("catastroph.*")))
# FIX(review): removed two stray empty-string patterns ("") that were left in
# the epidemic entry; they contribute nothing and clutter the dictionary.
dict_epidemic <- dictionary(list(epidemic = c("epidemic.*", "pandemic.*")))
dict_breakdown <- dictionary(list(breakdown = c("breakdown.*")))
dict_debacle <- dictionary(list(debacle = c("debacl.*")))

# Replace every token by its dictionary key ("crisis", "disaster", ...) and
# each non-matching token by the literal "NO".
apply_kw_dict <- function(dict) {
  tokens_lookup(x = cc.tokens, dictionary = dict, nomatch = "NO")
}

kw_crisis <- apply_kw_dict(dict_crisis)
kw_disaster <- apply_kw_dict(dict_disaster)
kw_collapse <- apply_kw_dict(dict_collapse)
kw_recession <- apply_kw_dict(dict_recession)
kw_emergency <- apply_kw_dict(dict_emergency)
kw_catastrophe <- apply_kw_dict(dict_catastrophe)
kw_epidemic <- apply_kw_dict(dict_epidemic)
kw_breakdown <- apply_kw_dict(dict_breakdown)
kw_debacle <- apply_kw_dict(dict_debacle)

# Count total keyword hits per document.
#
# x:  a list of character vectors (e.g. a quanteda tokens object after
#     tokens_lookup()), one element per document.
# kw: pattern passed to stringr::str_count() — here the dictionary key
#     (e.g. "crisis") that tokens_lookup() assigned to matching tokens.
#
# Returns an unnamed numeric vector: for each document, the number of tokens
# containing the pattern.
kwsum <- function(x, kw) {
  # vapply guarantees one numeric value per document; the original
  # as.numeric(lapply(..., sum)) construction achieved the same less directly.
  vapply(x, function(doc) sum(str_count(doc, pattern = kw)), numeric(1),
         USE.NAMES = FALSE)
}

# Per-document hit counts for each crisis term.
kw_crisis_sum <- kwsum(x = kw_crisis, kw = "crisis")
kw_disaster_sum <- kwsum(x = kw_disaster, kw = "disaster")
kw_collapse_sum <- kwsum(x = kw_collapse, kw = "collapse")
kw_recession_sum <- kwsum(x = kw_recession, kw = "recession")
kw_emergency_sum <- kwsum(x = kw_emergency, kw = "emergency")
kw_catastrophe_sum <- kwsum(x = kw_catastrophe, kw = "catastrophe")
kw_epidemic_sum <- kwsum(x = kw_epidemic, kw = "epidemic")
kw_breakdown_sum <- kwsum(x = kw_breakdown, kw = "breakdown")
kw_debacle_sum <- kwsum(x = kw_debacle, kw = "debacle")

# Share of documents with at least one hit per term (TRUE column).
prop.table(table(kw_crisis_sum > 0))
prop.table(table(kw_disaster_sum > 0))
prop.table(table(kw_collapse_sum > 0))
prop.table(table(kw_recession_sum > 0))
prop.table(table(kw_emergency_sum > 0))
prop.table(table(kw_catastrophe_sum > 0))
prop.table(table(kw_epidemic_sum > 0))
prop.table(table(kw_breakdown_sum > 0))
prop.table(table(kw_debacle_sum > 0))


# Hold-out R^2 of each set/window combination.
# NOTE(review): these use the raw `value` column, while the RMSE table above
# uses the recentred value_x — R^2 is shift-invariant, so results match only
# if value and value_x differ by a constant per panel; confirm.
with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1785-1900" & year %in% 1900:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1785-1950" & year %in% 1950:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1785-1975" & year %in% 1975:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1785-1990" & year %in% 1990:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1785-2000" & year %in% 2000:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1785-2020" & year %in% 1785:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "ARIMA_with_ORG" & train == "1900-2020" & year %in% 1785:1900), cor.test(cl.share, value))$estimate^2

with(subset(prediction_longX, set == "with_ORG" & train == "1785-1900" & year %in% 1900:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "with_ORG" & train == "1785-1950" & year %in% 1950:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "with_ORG" & train == "1785-1975" & year %in% 1975:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "with_ORG" & train == "1785-1990" & year %in% 1990:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "with_ORG" & train == "1785-2000" & year %in% 2000:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "with_ORG" & train == "1785-2020" & year %in% 1785:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "with_ORG" & train == "1900-2020" & year %in% 1785:1900), cor.test(cl.share, value))$estimate^2

with(subset(prediction_longX, set == "without_ORG" & train == "1785-1900" & year %in% 1900:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "without_ORG" & train == "1785-1950" & year %in% 1950:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "without_ORG" & train == "1785-1975" & year %in% 1975:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "without_ORG" & train == "1785-1990" & year %in% 1990:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "without_ORG" & train == "1785-2000" & year %in% 2000:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "without_ORG" & train == "1785-2020" & year %in% 1785:2020), cor.test(cl.share, value))$estimate^2
with(subset(prediction_longX, set == "without_ORG" & train == "1900-2020" & year %in% 1785:1900), cor.test(cl.share, value))$estimate^2


# Repeat of the calibration regression tables, without column labels.
stargazer(lm_CL1, lm_CL2, lm_CL7, lm_CL6, lm_CL5, lm_CL4, lm_CL3, type = "text")

stargazer(lm_CL1X, lm_CL2X, lm_CL7X, lm_CL6X, lm_CL5X, lm_CL4X, lm_CL3X, type = "text")

# Alternative OLS specifications: media interaction only, state block only,
# actor counts only, then their combinations.
lm_CLX1 <- lm(cl.share ~ Penetration * media_autonomy_i, data = thetimes)
lm_CLX2 <- lm(cl.share ~ statistics + spending_GDP + gini_rev_i + ministries, data = thetimes)
lm_CLX3 <- lm(cl.share ~ ccORG1000 + ccPERSON_per_article, data = thetimes)
lm_CLX4 <- lm(
  cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i + ministries,
  data = thetimes
)
lm_CLX5 <- lm(
  cl.share ~ Penetration * media_autonomy_i + statistics + spending_GDP + gini_rev_i + ministries + ccORG1000 + ccPERSON_per_article,
  data = thetimes
)

stargazer(lm_CLX1, lm_CLX2, lm_CLX3, lm_CLX4, lm_CLX5, type = "text")

# Nested models for organisational actor density (ccORG1000):
# _0 intercept only, _1 media block, _2 state block, _3 both; the "Y"
# variants add a linear year trend (dyear). Redundant wrapping parentheses
# from the original removed; formulas unchanged.
ORGlm_0 <- lm(ccORG1000 ~ 1, data = thetimes)
ORGlm_0Y <- lm(ccORG1000 ~ 1 + dyear, data = thetimes)

ORGlm_1 <- lm(ccORG1000 ~ Penetration + media_autonomy_i + 1, data = thetimes)
ORGlm_1Y <- lm(ccORG1000 ~ Penetration + media_autonomy_i + 1 + dyear, data = thetimes)

ORGlm_2 <- lm(ccORG1000 ~ spending_GDP + gini_rev_i + 1, data = thetimes)
ORGlm_2Y <- lm(ccORG1000 ~ spending_GDP + gini_rev_i + 1 + dyear, data = thetimes)

ORGlm_3 <- lm(ccORG1000 ~ spending_GDP + gini_rev_i + Penetration + media_autonomy_i + 1, data = thetimes)
ORGlm_3Y <- lm(ccORG1000 ~ spending_GDP + gini_rev_i + Penetration + media_autonomy_i + 1 + dyear, data = thetimes)

# HTML model tables: without and with the year trend.
stargazer(ORGlm_0, ORGlm_1, ORGlm_2, ORGlm_3, type = "html", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)
stargazer(ORGlm_0Y, ORGlm_1Y, ORGlm_2Y, ORGlm_3Y, type = "html", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)

# F tests of the combined model against each single-block model.
anova(ORGlm_1, ORGlm_3)
anova(ORGlm_2, ORGlm_3)

# Prewhiten the crisis-share series with forecast::Arima and keep the
# residuals: AR(1) residuals (CLS_100), first-difference residuals (CLS_010),
# and MA(2) residuals (CLS_002).  The regressions below then model only the
# residual (non-autocorrelated) variation in crisis share.
thetimes$CLS_100 <- Arima(thetimes$cl.share, order = c(1, 0, 0))$resid

thetimes$CLS_010 <- Arima(thetimes$cl.share, order = c(0, 1, 0))$resid

thetimes$CLS_002 <- Arima(thetimes$cl.share, order = c(0, 0, 2))$resid



# MA(2)-residual outcome: intercept-only, media-system, state-capacity,
# ORG-density, and combined specifications.
CClm_x0 <- (lm(CLS_002 ~ 1, data = thetimes))

CClm_x1 <- (lm(CLS_002 ~ Penetration + media_autonomy_i + 1, data = thetimes))

CClm_x1b <- (lm(CLS_002 ~ spending_GDP + gini_rev_i + 1, data = thetimes))

CClm_x1c <- (lm(CLS_002 ~ ccORG_per_article + 1, data = thetimes))

CClm_x2 <- (lm(CLS_002 ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + 1, data = thetimes))

CClm_x3 <- (lm(CLS_002 ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG_per_article + 1, data = thetimes))


# First-difference-residual outcome, same specification ladder.
# NOTE(review): the CClm_0..CClm_3 names assigned here are immediately
# overwritten by the CLS_100 set below (and again by the CL_pcent models
# further down) -- only the last assignment survives for later table calls.
CClm_0 <- (lm(CLS_010 ~ 1, data = thetimes))

CClm_1 <- (lm(CLS_010 ~ Penetration + media_autonomy_i + 1, data = thetimes))

CClm_1b <- (lm(CLS_010 ~ spending_GDP + gini_rev_i + 1, data = thetimes))

CClm_1c <- (lm(CLS_010 ~ ccORG_per_article + 1, data = thetimes))

CClm_2 <- (lm(CLS_010 ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + 1, data = thetimes))

CClm_3 <- (lm(CLS_010 ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG_per_article + 1, data = thetimes))


# AR(1)-residual outcome, overwriting the CClm_* names just assigned.
CClm_0 <- (lm(CLS_100 ~ 1, data = thetimes))

CClm_1 <- (lm(CLS_100 ~ Penetration + media_autonomy_i + 1, data = thetimes))

CClm_1b <- (lm(CLS_100 ~ spending_GDP + gini_rev_i + 1, data = thetimes))

CClm_1c <- (lm(CLS_100 ~ ccORG_per_article + 1, data = thetimes))

CClm_2 <- (lm(CLS_100 ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + 1, data = thetimes))

CClm_3 <- (lm(CLS_100 ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG_per_article + 1, data = thetimes))


# Percent-scale outcomes (shares * 100) and a year counter starting at 0 in
# 1785, so intercepts refer to the first year of the series.
thetimes$CL_pcent <- thetimes$cl.share * 100
thetimes$CNW_pcent <- thetimes$cnw.share * 100

thetimes$dyear <- thetimes$year - 1785

# Crisis-share (percent) specification ladder; these overwrite the
# residual-based CClm_* models defined above.
CClm_0 <- (lm(CL_pcent ~ 1, data = thetimes))

CClm_1 <- (lm(CL_pcent ~ Penetration + media_autonomy_i + 1, data = thetimes))

CClm_1b <- (lm(CL_pcent ~ spending_GDP + gini_rev_i + 1, data = thetimes))

CClm_1c <- (lm(CL_pcent ~ ccORG1000 + 1, data = thetimes))

CClm_2 <- (lm(CL_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + 1, data = thetimes))

CClm_3 <- (lm(CL_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000 + 1, data = thetimes))

# Same ladder with a linear year trend (dyear) added.
CClm_0Y <- (lm(CL_pcent ~ dyear + 1, data = thetimes))

CClm_1aY <- (lm(CL_pcent ~ Penetration + media_autonomy_i + dyear, data = thetimes))

CClm_1bY <- (lm(CL_pcent ~ spending_GDP + gini_rev_i + dyear, data = thetimes))

CClm_1cY <- (lm(CL_pcent ~ ccORG1000 + dyear, data = thetimes))

CClm_2Y <- (lm(CL_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + 1 + dyear, data = thetimes))

CClm_3Y <- (lm(CL_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000 + 1 + dyear, data = thetimes))

# F test of the media-system block against the intercept-only model.
anova(CClm_0, CClm_1)

# Text tables: without / with the year trend.
stargazer(CClm_0, CClm_1, CClm_1b, CClm_1c, CClm_2, CClm_3, type = "text", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)
stargazer(CClm_0Y, CClm_1aY, CClm_1bY, CClm_1cY, CClm_2Y, CClm_3Y, type = "text", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)

# Sequential F tests along the nested ladder.
anova(CClm_0, CClm_1, CClm_2, CClm_3)

# HTML table of the mediator (ccORG1000) models, repeated from above.
stargazer(ORGlm_0, ORGlm_1, ORGlm_2, ORGlm_3, type = "html", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = TRUE)

# BUG FIX: the original listed `CClm_Y`, which is never defined anywhere in
# the script (only CClm_0Y..CClm_3Y exist), so stargazer() errored on an
# undefined object.  CClm_3Y (full specification plus year trend) is the
# natural final column here -- TODO confirm that is the intended model.
stargazer(CClm_0, CClm_1, CClm_1b, CClm_1c, CClm_2, CClm_3, CClm_3Y, type = "text")

# Causal mediation analysis (mediation::mediate): does organizational
# density (ccORG1000) mediate the effect of each predictor on crisis share?
# model.m = mediator model (ORGlm_3), model.y = outcome model (CClm_3Y),
# one run per treatment variable.
# NOTE(review): model.y includes dyear but model.m (ORGlm_3) does not --
# confirm whether ORGlm_3Y was intended as the mediator model here.
med_penetration <- mediate(model.m = ORGlm_3, model.y = CClm_3Y, mediator = "ccORG1000", treat = "Penetration")
med_autonomy <- mediate(model.m = ORGlm_3, model.y = CClm_3Y, mediator = "ccORG1000", treat = "media_autonomy_i")
med_spending <- mediate(model.m = ORGlm_3, model.y = CClm_3Y, mediator = "ccORG1000", treat = "spending_GDP")
med_diversity <- mediate(model.m = ORGlm_3, model.y = CClm_3Y, mediator = "ccORG1000", treat = "gini_rev_i")

# Sensitivity of each mediation estimate to unobserved mediator-outcome
# confounding (mediation::medsens).
sensitivity_penetration <- medsens(med_penetration, effect.type = "both")
sensitivity_autonomy <- medsens(med_autonomy, effect.type = "both")
sensitivity_spending <- medsens(med_spending, effect.type = "both")
sensitivity_diversity <- medsens(med_diversity, effect.type = "both")

# Derived variables: ORG mentions per 1000 articles and log crisis-news-word
# count (log1p form guards against log(0)).
# NOTE(review): ccORG1000 is created HERE but is already used by models
# earlier in this script -- the chunk only runs cleanly if this assignment
# (or an equivalent one) has been executed before those chunks.
thetimes$ccORG1000 <- thetimes$ccORG_per_article * 1000
thetimes$logCNW <- log(thetimes$cnw + 1)

# lavaan SEM: three outcome equations (crisis share, crisis-news share, log
# count) plus a mediator equation for ccORG1000.  Labels a1-a4 (predictor ->
# mediator) and b/c/d (mediator -> outcomes) define the := indirect effects.
lmodel <- "
            CL_pcent ~          1 + d*ccORG1000 + spending_GDP    + gini_rev_i    + Penetration    + media_autonomy_i
            CNW_pcent ~         1 + c*ccORG1000 + spending_GDP    + gini_rev_i    + Penetration    + media_autonomy_i
            logCNW ~                1 + b*ccORG1000 + spending_GDP    + gini_rev_i    + Penetration    + media_autonomy_i
            ccORG1000 ~ 1 +                       a1*spending_GDP + a2*gini_rev_i + a3*Penetration + a4*media_autonomy_i
            intensity_cls:=a1*d
            intensity_cnws:=a1*c
            intensity_cnw:=a1*b
            diversity_cls:=a2*d
            diversity_cnws:=a2*c
            diversity_cnw:=a2*b
            penetration_cls:=a3*d
            penetration_cnws:=a3*c
            penetration_cnw:=a3*b
            autonomy_cls:=a4*d
            autonomy_cnws:=a4*c
            autonomy_cnw:=a4*b
            "

# Variant: full mediation -- predictors affect the outcomes only through
# ccORG1000 (no direct paths in the outcome equations).
lmodel2 <- "
            CL_pcent ~          1 + d*ccORG1000
            CNW_pcent ~         1 + c*ccORG1000
            logCNW ~                1 + b*ccORG1000
            ccORG1000 ~ 1 +                       a1*spending_GDP + a2*gini_rev_i + a3*Penetration + a4*media_autonomy_i
            intensity_cls:=a1*d
            intensity_cnws:=a1*c
            intensity_cnw:=a1*b
            diversity_cls:=a2*d
            diversity_cnws:=a2*c
            diversity_cnw:=a2*b
            penetration_cls:=a3*d
            penetration_cnws:=a3*c
            penetration_cnw:=a3*b
            autonomy_cls:=a4*d
            autonomy_cnws:=a4*c
            autonomy_cnw:=a4*b
            "

# Variant: direct effects only, no mediator equation.
lmodel3 <- "
            CL_pcent ~          spending_GDP    + gini_rev_i    + Penetration    + media_autonomy_i
            CNW_pcent ~         spending_GDP    + gini_rev_i    + Penetration    + media_autonomy_i
            logCNW ~            spending_GDP    + gini_rev_i    + Penetration    + media_autonomy_i
"


# Fit the three structural models with lavaan::sem.
lfit <- sem(model = lmodel, data = thetimes)

lfit2 <- sem(model = lmodel2, data = thetimes)
lfit3 <- sem(model = lmodel3, data = thetimes)


# Auxiliary check: crisis-news-word share vs ORG density controlling for the
# yearly article count.
CNWlm_0x <- (lm(CNW_pcent ~ 1 + ccORG1000 + articles, data = thetimes))


# Crisis-news-word share (percent): intercept-only baseline ...
CNWlm_0 <- (lm(CNW_pcent ~ 1, data = thetimes))

# ... and a no-intercept pure-trend variant (note the `0 +`).
CNWlm_0Y0 <- (lm(CNW_pcent ~ 0 + dyear, data = thetimes))


# Specification ladder: media system, state capacity, both, both + ORG.
CNWlm_1a <- (lm(CNW_pcent ~ Penetration + media_autonomy_i, data = thetimes))

CNWlm_1b <- (lm(CNW_pcent ~ spending_GDP + gini_rev_i, data = thetimes))

CNWlm_2 <- (lm(CNW_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes))

CNWlm_3 <- (lm(CNW_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes))


# Same ladder with a linear year trend (dyear).
CNWlm_0Y <- (lm(CNW_pcent ~ dyear, data = thetimes))

CNWlm_1aY <- (lm(CNW_pcent ~ Penetration + media_autonomy_i + dyear, data = thetimes))

CNWlm_1bY <- (lm(CNW_pcent ~ spending_GDP + gini_rev_i + dyear, data = thetimes))

CNWlm_2Y <- (lm(CNW_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + dyear, data = thetimes))

CNWlm_3Y <- (lm(CNW_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000 + dyear, data = thetimes))

# LaTeX tables (for the manuscript) and text tables (for inspection),
# each without / with the year trend.
stargazer(CNWlm_0, CNWlm_1a, CNWlm_1b, CNWlm_2, CNWlm_3, type = "latex", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = FALSE)
stargazer(CNWlm_0Y, CNWlm_1aY, CNWlm_1bY, CNWlm_2Y, CNWlm_3Y, type = "latex", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = FALSE)

stargazer(CNWlm_0, CNWlm_1a, CNWlm_1b, CNWlm_2, CNWlm_3, type = "text", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = FALSE)
stargazer(CNWlm_0Y, CNWlm_1aY, CNWlm_1bY, CNWlm_2Y, CNWlm_3Y, type = "text", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = FALSE)
# BUG FIX: the original referenced `CNWlm_1`, which is not defined in this
# section (the media-system baseline is named CNWlm_1a; a different CNWlm_1
# is only created much later for the 100x-scaled outcome).  Using CNWlm_1a
# also keeps the anova() sequence properly nested
# (CNWlm_1a c CNWlm_2 c CNWlm_3).
anova(CNWlm_1a, CNWlm_2, CNWlm_3)

stargazer(CNWlm_1a, CNWlm_2, CNWlm_3, type = "text")

# Mediation runs for the crisis-news-word share outcome (CNWlm_3Y), mirroring
# the crisis-share runs above: ccORG1000 as mediator, one run per treatment.
med_penetration2 <- mediate(model.m = ORGlm_3, model.y = CNWlm_3Y, mediator = "ccORG1000", treat = "Penetration")
med_autonomy2 <- mediate(model.m = ORGlm_3, model.y = CNWlm_3Y, mediator = "ccORG1000", treat = "media_autonomy_i")
med_spending2 <- mediate(model.m = ORGlm_3, model.y = CNWlm_3Y, mediator = "ccORG1000", treat = "spending_GDP")
med_diversity2 <- mediate(model.m = ORGlm_3, model.y = CNWlm_3Y, mediator = "ccORG1000", treat = "gini_rev_i")

# Sensitivity to unobserved confounding for the CNW mediations.
sensitivity_penetration2 <- medsens(med_penetration2, effect.type = "both")
sensitivity_autonomy2 <- medsens(med_autonomy2, effect.type = "both")
sensitivity_spending2 <- medsens(med_spending2, effect.type = "both")
sensitivity_diversity2 <- medsens(med_diversity2, effect.type = "both")



# Log-count outcome (recomputed here; identical to the earlier assignment).
thetimes$logCNW <- log(thetimes$cnw + 1)

# Specification ladder for log(cnw + 1), without year trend ...
COUNTlm_0 <- (lm(logCNW ~ 1, data = thetimes))

COUNTlm_1a <- (lm(logCNW ~ Penetration + media_autonomy_i, data = thetimes))

COUNTlm_1b <- (lm(logCNW ~ spending_GDP + gini_rev_i, data = thetimes))

COUNTlm_1c <- (lm(logCNW ~ ccORG1000, data = thetimes))

COUNTlm_2 <- (lm(logCNW ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes))

COUNTlm_3 <- (lm(logCNW ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes))

# ... and with the linear year trend.
COUNTlm_0Y <- (lm(logCNW ~ 1 + dyear, data = thetimes))

COUNTlm_1aY <- (lm(logCNW ~ Penetration + media_autonomy_i + dyear, data = thetimes))

COUNTlm_1bY <- (lm(logCNW ~ spending_GDP + gini_rev_i + dyear, data = thetimes))

COUNTlm_1cY <- (lm(logCNW ~ ccORG1000 + dyear, data = thetimes))

COUNTlm_2Y <- (lm(logCNW ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + dyear, data = thetimes))

COUNTlm_3Y <- (lm(logCNW ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000 + dyear, data = thetimes))

# Robustness check with the alternative autonomy index.
# NOTE(review): media_autonomy_alt is only created further DOWN in this
# script (from the V-Dem merge) -- these three fits fail unless that chunk
# ran first; confirm execution order.
COUNTlm_1aQ <- (lm(logCNW ~ Penetration + media_autonomy_alt + dyear, data = thetimes))

COUNTlm_2Q <- (lm(logCNW ~ Penetration + media_autonomy_alt + spending_GDP + gini_rev_i + dyear, data = thetimes))

COUNTlm_3Q <- (lm(logCNW ~ Penetration + media_autonomy_alt + spending_GDP + gini_rev_i + ccORG1000 + dyear, data = thetimes))

# LaTeX tables for the log-count models, without / with the year trend.
stargazer(COUNTlm_1a, COUNTlm_1b, COUNTlm_1c, COUNTlm_2, COUNTlm_3, type = "latex", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = FALSE)

stargazer(COUNTlm_1aY, COUNTlm_1bY, COUNTlm_1cY, COUNTlm_2Y, COUNTlm_3Y, type = "latex", intercept.bottom = FALSE, star.cutoffs = c(.05, .01, .001), single.row = FALSE)




# Negative-binomial count models for the raw crisis-news-word count (cnw).
# NOTE(review): these overwrite the COUNTlm_* log-lm objects defined above,
# and are themselves overwritten again further down -- only the last
# definitions feed the later anova/stargazer calls.
COUNTlm_0 <- (MASS::glm.nb(cnw ~ 1, data = thetimes))

COUNTlm_1 <- (glm.nb(cnw ~ Penetration + media_autonomy_i, data = thetimes))

COUNTlm_1b <- (glm.nb(cnw ~ spending_GDP + gini_rev_i, data = thetimes))

COUNTlm_1c <- (glm.nb(cnw ~ ccORG_per_article, data = thetimes))

COUNTlm_2 <- (glm.nb(cnw ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes))

# NOTE(review): this full model uses I(cnw + 1) as the response, unlike the
# other glm.nb fits on raw cnw -- confirm whether the +1 shift is intended.
COUNTlm_3 <- (glm.nb(I(cnw + 1) ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG_per_article, data = thetimes))

# NOTE(review): plain glm() without a family argument fits a GAUSSIAN model,
# not a count model, despite the COUNT* name -- confirm intent.
COUNTlm_3X <- (glm(cnw ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG_per_article, data = thetimes))


# Poisson GLM versions of the same ladder.
COUNTglm_0 <- (glm(cnw ~ 1, data = thetimes, family = "poisson"))
COUNTglm_1 <- (glm(cnw ~ 1 + Penetration + media_autonomy_i, data = thetimes, family = "poisson"))
COUNTglm_1b <- (glm(cnw ~ 1 + spending_GDP + gini_rev_i, data = thetimes, family = "poisson"))
COUNTglm_1c <- (glm(cnw ~ 1 + ccORG_per_article, data = thetimes, family = "poisson"))
COUNTglm_2 <- (glm(cnw ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes, family = "poisson"))
COUNTglm_3 <- (glm(cnw ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG_per_article, data = thetimes, family = "poisson"))

# NOTE(review): this immediately overwrites the Poisson COUNTglm_3 above
# with a log-linear lm -- the Poisson full model is lost.
COUNTglm_3 <- (lm(log(cnw + 1) ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG_per_article, data = thetimes))



# Zero-inflated Poisson versions (pscl::zeroinfl), all with a year trend.
COUNTziglm_0 <- (zeroinfl(cnw ~ 1 + dyear, data = thetimes, dist = "poisson"))
COUNTziglm_1 <- (zeroinfl(cnw ~ 1 + Penetration + media_autonomy_i + dyear, data = thetimes, dist = "poisson"))
COUNTziglm_1b <- (zeroinfl(cnw ~ 1 + spending_GDP + gini_rev_i + dyear, data = thetimes, dist = "poisson"))
COUNTziglm_1c <- (zeroinfl(cnw ~ 1 + ccORG_per_article + dyear, data = thetimes, dist = "poisson"))
COUNTziglm_2 <- (zeroinfl(cnw ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + dyear, data = thetimes, dist = "poisson"))
COUNTziglm_3 <- (zeroinfl(cnw ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG_per_article + dyear, data = thetimes, dist = "poisson"))


# Final (surviving) COUNTlm_* definitions: log-linear models with a raw
# `year` trend; these overwrite the glm.nb fits above and are what the
# anova/stargazer/cor.test calls below actually use.
COUNTlm_0 <- (lm(log(cnw + 1) ~ 1 + year, data = thetimes))
COUNTlm_1 <- (lm(log(cnw + 1) ~ 1 + Penetration + media_autonomy_i + year, data = thetimes))
COUNTlm_1b <- (lm(log(cnw + 1) ~ 1 + spending_GDP + gini_rev_i + year, data = thetimes))
COUNTlm_1c <- (lm(log(cnw + 1) ~ 1 + ccORG_per_article + year, data = thetimes))
COUNTlm_2 <- (lm(log(cnw + 1) ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + year, data = thetimes))
COUNTlm_3 <- (lm(log(cnw + 1) ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG_per_article + year, data = thetimes))



# Each one-block model against the trend-only baseline, then full vs. both.
anova(COUNTlm_0, COUNTlm_1)
anova(COUNTlm_0, COUNTlm_1b)
anova(COUNTlm_0, COUNTlm_1c)

anova(COUNTlm_2, COUNTlm_3)

stargazer(COUNTlm_1, COUNTlm_1b, COUNTlm_1c, COUNTlm_2, COUNTlm_3, type = "text")

# Pseudo-R^2: squared correlation of observed values with fitted values.
# NOTE(review): `thetimes$CNW` -- the data column is lower-case `cnw`; `$CNW`
# relies on partial matching and may silently resolve to CNW_pcent instead;
# also predict.glm() is applied to lm objects here, and the predictions are
# on the log(cnw + 1) scale while the comparison series is not.  Verify.
cor.test(thetimes$CNW, predict.glm(COUNTlm_0))$est^2
cor.test(thetimes$CNW, predict.glm(COUNTlm_1))$est^2
cor.test(thetimes$CNW, predict.glm(COUNTlm_1b))$est^2
cor.test(thetimes$CNW, predict.glm(COUNTlm_1c))$est^2
cor.test(thetimes$CNW, predict.glm(COUNTlm_2))$est^2
cor.test(thetimes$CNW, predict.glm(COUNTlm_3))$est^2

# Mediation for the log-count outcome with the year-trend models.
med_penetration3 <- mediate(model.m = ORGlm_3, model.y = COUNTlm_3Y, mediator = "ccORG1000", treat = "Penetration")
med_autonomy3 <- mediate(model.m = ORGlm_3, model.y = COUNTlm_3Y, mediator = "ccORG1000", treat = "media_autonomy_i")
med_spending3 <- mediate(model.m = ORGlm_3, model.y = COUNTlm_3Y, mediator = "ccORG1000", treat = "spending_GDP")
med_diversity3 <- mediate(model.m = ORGlm_3, model.y = COUNTlm_3Y, mediator = "ccORG1000", treat = "gini_rev_i")

# NOTE(review): COUNTlm_3X contains ccORG_per_article, not ccORG1000, so the
# mediator name passed here does not appear in model.y -- these calls will
# fail or mis-specify the mediator.  Verify mediator/model pairing.
med_penetration3X <- mediate(model.m = ORGlm_1, model.y = COUNTlm_3X, mediator = "ccORG1000", treat = "Penetration")
med_autonomy3X <- mediate(model.m = ORGlm_1, model.y = COUNTlm_3X, mediator = "ccORG1000", treat = "media_autonomy_i")
med_spending3X <- mediate(model.m = ORGlm_2, model.y = COUNTlm_3X, mediator = "ccORG1000", treat = "spending_GDP")
med_diversity3X <- mediate(model.m = ORGlm_2, model.y = COUNTlm_3X, mediator = "ccORG1000", treat = "gini_rev_i")


# Sensitivity analyses for the log-count mediations.
sensitivity_penetration3 <- medsens(med_penetration3, effect.type = "both")
sensitivity_autonomy3 <- medsens(med_autonomy3, effect.type = "both")
sensitivity_spending3 <- medsens(med_spending3, effect.type = "both")
sensitivity_diversity3 <- medsens(med_diversity3, effect.type = "both")


# Alternative outcome scalings for effect-size comparability.
# z-scores multiplied by 100.  NOTE(review): scale() returns a one-column
# matrix, so CL_z / CNW_z are matrix columns inside the data frame.
thetimes$CL_z <- 100 * scale(thetimes$CL_pcent, scale = TRUE)
thetimes$CNW_z <- 100 * scale(thetimes$CNW_pcent, scale = TRUE)

# Proportion of the series maximum (0..1) ...
thetimes$CL_mmx <- (thetimes$CL_pcent / max(thetimes$CL_pcent, na.rm = TRUE))
thetimes$CNW_mmx <- (thetimes$CNW_pcent / max(thetimes$CNW_pcent, na.rm = TRUE))

# ... mean-centred and expressed in percentage points of the maximum.
thetimes$CL_mm <- 100 * (thetimes$CL_mmx - mean(thetimes$CL_mmx, na.rm = TRUE))
thetimes$CNW_mm <- 100 * (thetimes$CNW_mmx - mean(thetimes$CNW_mmx, na.rm = TRUE))

# Specification ladders re-run on the rescaled outcomes defined above:
# *_z models use the 100-x-z-score outcomes, *_mm models the mean-centred
# percent-of-maximum outcomes.  Each ladder: intercept only, year trend,
# media system, state capacity, both, both + ORG density.

# Crisis-news-word share, z-scored.
CNWlm_0z <- lm(CNW_z ~ 1, data = thetimes)
CNWlm_tz <- lm(CNW_z ~ dyear, data = thetimes)
CNWlm_1z <- lm(CNW_z ~ Penetration + media_autonomy_i, data = thetimes)
CNWlm_2z <- lm(CNW_z ~ spending_GDP + gini_rev_i, data = thetimes)
CNWlm_3z <- lm(CNW_z ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes)
CNWlm_4z <- lm(CNW_z ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes)

# Crisis-news-word share, percent-of-maximum.
CNWlm_0mm <- lm(CNW_mm ~ 1, data = thetimes)
CNWlm_tmm <- lm(CNW_mm ~ dyear, data = thetimes)
CNWlm_1mm <- lm(CNW_mm ~ Penetration + media_autonomy_i, data = thetimes)
CNWlm_2mm <- lm(CNW_mm ~ spending_GDP + gini_rev_i, data = thetimes)
CNWlm_3mm <- lm(CNW_mm ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes)
CNWlm_4mm <- lm(CNW_mm ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes)

# Crisis-article share, z-scored.
CLlm_0z <- lm(CL_z ~ 1, data = thetimes)
CLlm_tz <- lm(CL_z ~ dyear, data = thetimes)
CLlm_1z <- lm(CL_z ~ Penetration + media_autonomy_i, data = thetimes)
CLlm_2z <- lm(CL_z ~ spending_GDP + gini_rev_i, data = thetimes)
CLlm_3z <- lm(CL_z ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes)
CLlm_4z <- lm(CL_z ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes)

# Crisis-article share, percent-of-maximum.
CLlm_0mm <- lm(CL_mm ~ 1, data = thetimes)
CLlm_tmm <- lm(CL_mm ~ dyear, data = thetimes)
CLlm_1mm <- lm(CL_mm ~ Penetration + media_autonomy_i, data = thetimes)
CLlm_2mm <- lm(CL_mm ~ spending_GDP + gini_rev_i, data = thetimes)
CLlm_3mm <- lm(CL_mm ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes)
CLlm_4mm <- lm(CL_mm ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes)




# Models on 100 * CL_pcent / 100 * CNW_pcent.
# NOTE(review): CL_pcent and CNW_pcent are already share * 100, so these
# responses are share * 10000 -- confirm the double scaling is intended
# (it only rescales coefficients, not test statistics).
CLlm_t <- (lm(100 * CL_pcent ~ dyear, data = thetimes))

CLlm_1 <- (lm(100 * CL_pcent ~ Penetration + media_autonomy_i, data = thetimes))

CLlm_2 <- (lm(100 * CL_pcent ~ spending_GDP + gini_rev_i, data = thetimes))

CLlm_3 <- (lm(100 * CL_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes))

CLlm_4 <- (lm(100 * CL_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes))

CNWlm_0 <- (lm(100 * CNW_pcent ~ 1, data = thetimes))

CNWlm_t <- (lm(100 * CNW_pcent ~ dyear, data = thetimes))

CNWlm_1 <- (lm(100 * CNW_pcent ~ Penetration + media_autonomy_i, data = thetimes))

CNWlm_2 <- (lm(100 * CNW_pcent ~ spending_GDP + gini_rev_i, data = thetimes))

CNWlm_3 <- (lm(100 * CNW_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes))

CNWlm_4 <- (lm(100 * CNW_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes))

# Percent-scale counterparts used for effect-size (eta^2) calculations.
CNWlm_4x <- (lm(CNW_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes))
CLlm_4x <- (lm(CL_pcent ~ Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes))

CNWlm_1x <- (lm(CNW_pcent ~ Penetration + media_autonomy_i, data = thetimes))
CNWlm_2x <- (lm(CNW_pcent ~ spending_GDP + gini_rev_i, data = thetimes))

# eta^2 per predictor (lsr::etaSquared).
etaSquared(CNWlm_1x)
etaSquared(CNWlm_2x)

CLlm_1x <- (lm(CL_pcent ~ Penetration + media_autonomy_i, data = thetimes))
CLlm_2x <- (lm(CL_pcent ~ spending_GDP + gini_rev_i, data = thetimes))

etaSquared(CLlm_1x)
etaSquared(CLlm_2x)

# Trend-only models, for the variance explained by time alone.
CLlm_tx <- (lm(CL_pcent ~ dyear, data = thetimes))
CNWlm_tx <- (lm(CNW_pcent ~ dyear, data = thetimes))

etaSquared(CLlm_tx)
etaSquared(CNWlm_tx)



# Merge the V-Dem media-autonomy series for the UK into thetimes, aligning
# by year via match(); rows without a V-Dem year stay NA.
thetimes[match(subset(co_vdem, country == "UK" & year < 2021)$year, thetimes$year), "media_autonomy_VDEM"] <- subset(co_vdem, country == "UK" & year < 2021)$media_autonomy

# Fill gaps backwards in time (next observation carried back, imputeTS).
thetimes$media_autonomy_VDEM_i <- na_locf(thetimes$media_autonomy_VDEM, option = "nocb")

# Min-max normalise the V-Dem series against the full cross-country V-Dem
# range, and the project index against its own range, then average the two
# into an alternative autonomy measure.
thetimes$media_autonomy_VDEM_minmax <- (thetimes$media_autonomy_VDEM_i - min(co_vdem$media_autonomy)) / (max(co_vdem$media_autonomy) - min(co_vdem$media_autonomy))

thetimes$media_autonomy_i_minmax <- (thetimes$media_autonomy_i - min(thetimes$media_autonomy_i)) / (max(thetimes$media_autonomy_i) - min(thetimes$media_autonomy_i))

thetimes$media_autonomy_alt <- (thetimes$media_autonomy_VDEM_minmax + thetimes$media_autonomy_i_minmax) / 2


# Final numbered model set used in the reported mediation analyses:
# M4.4 mediator model, M1.* crisis share, M2.* crisis-news share,
# M3.* log count; .1 intercept only, .2 media system, .3 state capacity,
# .5 both, .6 both + mediator.
M4.4 <- lm(ccORG1000 ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes)

M1.1 <- lm(CL_pcent ~ 1, data = thetimes)
M1.2 <- lm(CL_pcent ~ 1 + Penetration + media_autonomy_i, data = thetimes)
M1.3 <- lm(CL_pcent ~ 1 + spending_GDP + gini_rev_i, data = thetimes)
M1.5 <- lm(CL_pcent ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes)
M1.6 <- lm(CL_pcent ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes)

M2.1 <- lm(CNW_pcent ~ 1, data = thetimes)
M2.2 <- lm(CNW_pcent ~ 1 + Penetration + media_autonomy_i, data = thetimes)
M2.3 <- lm(CNW_pcent ~ 1 + spending_GDP + gini_rev_i, data = thetimes)
M2.5 <- lm(CNW_pcent ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes)
M2.6 <- lm(CNW_pcent ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes)

M3.1 <- lm(log(cnw + 1) ~ 1, data = thetimes)
M3.2 <- lm(log(cnw + 1) ~ 1 + Penetration + media_autonomy_i, data = thetimes)
M3.3 <- lm(log(cnw + 1) ~ 1 + spending_GDP + gini_rev_i, data = thetimes)
M3.5 <- lm(log(cnw + 1) ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i, data = thetimes)
M3.6 <- lm(log(cnw + 1) ~ 1 + Penetration + media_autonomy_i + spending_GDP + gini_rev_i + ccORG1000, data = thetimes)

# Mediation of each treatment through ccORG1000 for the three outcomes
# (explicit mediation:: namespace; same mediator model M4.4 throughout).
MM1.6_m.aut <- mediation::mediate(model.y = M1.6, model.m = M4.4, treat = "media_autonomy_i", mediator = "ccORG1000")
MM1.6_m.pen <- mediation::mediate(model.y = M1.6, model.m = M4.4, treat = "Penetration", mediator = "ccORG1000")
MM1.6_s.int <- mediation::mediate(model.y = M1.6, model.m = M4.4, treat = "spending_GDP", mediator = "ccORG1000")
MM1.6_s.div <- mediation::mediate(model.y = M1.6, model.m = M4.4, treat = "gini_rev_i", mediator = "ccORG1000")

MM2.6_m.aut <- mediation::mediate(model.y = M2.6, model.m = M4.4, treat = "media_autonomy_i", mediator = "ccORG1000")
MM2.6_m.pen <- mediation::mediate(model.y = M2.6, model.m = M4.4, treat = "Penetration", mediator = "ccORG1000")
MM2.6_s.int <- mediation::mediate(model.y = M2.6, model.m = M4.4, treat = "spending_GDP", mediator = "ccORG1000")
MM2.6_s.div <- mediation::mediate(model.y = M2.6, model.m = M4.4, treat = "gini_rev_i", mediator = "ccORG1000")

MM3.6_m.aut <- mediation::mediate(model.y = M3.6, model.m = M4.4, treat = "media_autonomy_i", mediator = "ccORG1000")
MM3.6_m.pen <- mediation::mediate(model.y = M3.6, model.m = M4.4, treat = "Penetration", mediator = "ccORG1000")
MM3.6_s.int <- mediation::mediate(model.y = M3.6, model.m = M4.4, treat = "spending_GDP", mediator = "ccORG1000")
MM3.6_s.div <- mediation::mediate(model.y = M3.6, model.m = M4.4, treat = "gini_rev_i", mediator = "ccORG1000")

6.2 Comparison of the crisis and non-crisis corpora

Code
  # Term frequencies of the non-crisis document-term matrix.
  nc.wfreq <- quanteda.textstats::textstat_frequency(nc.dtm)

  # BUG FIX: the original called setwd(".//crisis2//") twice in a row; the
  # path is relative, so the second call tried to descend into
  # ./crisis2/crisis2 (erroring, or silently landing in the wrong folder if
  # such a directory exists).  Change into the crisis2 folder exactly once.
  setwd(".//crisis2//")

  # Tokenise the reduced crisis corpus, dropping punctuation.
  c.tokens          <-      tokens(cp_red,remove_punct=TRUE)
  # c.tokens            <-      tokens(atok,remove_punct=TRUE)

  # Crisis DTM with English stopwords plus the project stopword list removed.
  c.dtm                 <-      dfm(tokens_remove(c.tokens,pattern=c(stopwords("en"),sw)))

  c.wfreq <- textstat_frequency(c.dtm)

  # Rebuild the crisis DTM's document variables from scratch: back up the
  # current docvars (via cc.dtm), wipe them, then copy back a fixed set of
  # fields so that c.dtm and nc.dtm have compatible docvars for rbind below.
  cc.dtm <- c.dtm
  docvars(c.dtm) <- NULL

  docvars(c.dtm)$id <- docvars(cc.dtm)$id
  docvars(c.dtm)$headline <- docvars(cc.dtm)$headline
  docvars(c.dtm)$type <- "Newspaper"
  docvars(c.dtm)$genre <- docvars(cc.dtm)$genre
  docvars(c.dtm)$newspaper <- docvars(cc.dtm)$newspaper
  docvars(c.dtm)$date <- docvars(cc.dtm)$date
  docvars(c.dtm)$author <- docvars(cc.dtm)$author
  # Note: the original `archive` field is renamed to `source` here.
  docvars(c.dtm)$source <- docvars(cc.dtm)$archive
  docvars(c.dtm)$year <- docvars(cc.dtm)$year
  docvars(c.dtm)$day <- docvars(cc.dtm)$day
  docvars(c.dtm)$month <- docvars(cc.dtm)$month
  docvars(c.dtm)$Month <- docvars(cc.dtm)$Month
  docvars(c.dtm)$time <- docvars(cc.dtm)$time
  docvars(c.dtm)$Time <- docvars(cc.dtm)$Time
  # Corpus label used as the keyness target below.
  docvars(c.dtm)$Corpus <- "Crisis"

  # save(c.dtm,file="c_dtm.RData")
  # save(nc.dtm,file="nc_dtm.RData")
  # save(t.dtm,file="t_dtm.RData")


  # Combined DTM: crisis + non-crisis documents, with docvars re-attached
  # row-wise in the same order.
  t.dtm <- rbind(c.dtm,nc.dtm)
  # docvars(t.dtm) <- rbind(
  #                 docvars(c.dtm)[,c("id", "date", "newspaper", "headline", "genre", "author", "type", "year", "month", "day", "time", "Corpus")],
  #                 docvars(nc.dfm)[,c("id", "date", "newspaper", "headline", "genre", "author", "type", "year", "month", "day", "time", "Corpus")])
  docvars(t.dtm) <- rbind(docvars(c.dtm),docvars(nc.dtm))

  # Split the combined DTM into nine historical periods (each subset label
  # names the period's END year; bounds are exclusive on both sides).
  t.dtm_1825 <- dfm_subset(t.dtm,year>1784 & year<1826)
  t.dtm_1850 <- dfm_subset(t.dtm,year>1825 & year<1851)
  t.dtm_1875 <- dfm_subset(t.dtm,year>1850 & year<1876)
  t.dtm_1900 <- dfm_subset(t.dtm,year>1875 & year<1901)
  t.dtm_1925 <- dfm_subset(t.dtm,year>1900 & year<1926)
  t.dtm_1950 <- dfm_subset(t.dtm,year>1925 & year<1951)
  t.dtm_1975 <- dfm_subset(t.dtm,year>1950 & year<1976)
  t.dtm_2000 <- dfm_subset(t.dtm,year>1975 & year<2001)
  t.dtm_2020 <- dfm_subset(t.dtm,year>2000 & year<2021)

  # save(t.dtm_1825,"t_dtm_1825.RData")

  # Period-wise keyness: crisis documents as target vs. non-crisis reference.
  ckw_1825 <- textstat_keyness(t.dtm_1825,docvars(t.dtm_1825)$Corpus=='Crisis')
  ckw_1850 <- textstat_keyness(t.dtm_1850,docvars(t.dtm_1850)$Corpus=='Crisis')
  ckw_1875 <- textstat_keyness(t.dtm_1875,docvars(t.dtm_1875)$Corpus=='Crisis')
  ckw_1900 <- textstat_keyness(t.dtm_1900,docvars(t.dtm_1900)$Corpus=='Crisis')
  ckw_1925 <- textstat_keyness(t.dtm_1925,docvars(t.dtm_1925)$Corpus=='Crisis')
  ckw_1950 <- textstat_keyness(t.dtm_1950,docvars(t.dtm_1950)$Corpus=='Crisis')
  ckw_1975 <- textstat_keyness(t.dtm_1975,docvars(t.dtm_1975)$Corpus=='Crisis')
  ckw_2000 <- textstat_keyness(t.dtm_2000,docvars(t.dtm_2000)$Corpus=='Crisis')
  ckw_2020 <- textstat_keyness(t.dtm_2020,docvars(t.dtm_2020)$Corpus=='Crisis')

  # Two-document DTM: all crisis docs vs. all non-crisis docs aggregated.
  g.dtm <- dfm_group(t.dtm,groups=docvars(t.dtm)$Corpus)

  # Total token counts per corpus, used as normalising denominators below.
  # NOTE(review): nc.tokens is assumed to exist from the non-crisis DTM
  # build elsewhere in the file.
  nwords.crisis <- sum(ntoken(c.tokens))
  nwords.noncrisis <- sum(ntoken(nc.tokens))

  # Overall keyness of the crisis corpus, with per-corpus frequency percents.
  crisis_terms <- textstat_keyness(t.dtm,docvars(t.dtm)$Corpus=='Crisis')

  crisis_terms$target_percent <- 100*crisis_terms$n_target/nwords.crisis
  crisis_terms$reference_percent <- 100*crisis_terms$n_reference/nwords.noncrisis

  # Keyness of the NON-crisis corpus against the crisis corpus.  Two fixes:
  # (1) target must select documents, mirroring the crisis_terms call above;
  #     the bare string "Noncrisis" is not a document name in t.dtm, so the
  #     original target= specification could not select the intended docs.
  #     Selecting Corpus != 'Crisis' picks exactly the nc.dtm documents
  #     regardless of their exact label.
  # (2) the target here is the non-crisis corpus, so target counts are
  #     normalised by nwords.noncrisis and reference counts by nwords.crisis
  #     (the original had the two denominators swapped).
  noncrisis_terms <- textstat_keyness(t.dtm,docvars(t.dtm)$Corpus!='Crisis')

  noncrisis_terms$target_percent <- 100*noncrisis_terms$n_target/nwords.noncrisis
  noncrisis_terms$reference_percent <- 100*noncrisis_terms$n_reference/nwords.crisis

  # Regularised logistic regression classifier: crisis vs. non-crisis docs.
  tmlr.crisis <- textmodel_lr(t.dtm,docvars(t.dtm)$Corpus=="Crisis")

  # Seed words for the latent semantic scaling of "crisis-ness".
  crisis <- c("crisis", "disaster", "catastrophe", "collapse", "epidemic", "pandemic", "debacle", "emergency")

  # LSX latent semantic scaling (300 singular values; SVD cached on disk).
  # Note the double assignment: lss.crisis and tmod_lss are the same object.
  lss.crisis <- tmod_lss <- textmodel_lss(t.dtm, seed = crisis,
                            k = 300, cache = TRUE)



  # Return the in-memory size (an `object_size` value, from object.size())
  # of the object whose name is given as a character string; the name is
  # resolved with get() from the calling context.
  file_size_overview <- function(x) {
    object.size(get(x))
  }

  # Inventory of object sizes in the current workspace.
  # BUG FIX: the original re-evaluated ls() on every line, so the later
  # calls also saw the freshly created `filelist` (length mismatch), and
  # patched that with hard-coded indices (ls()[c(-121,-122,-120)]) that
  # break whenever the workspace contents change.  Snapshot the object
  # names ONCE so the sizes, the data frame, and the ordering all refer to
  # the same, consistent set of objects.
  object_names <- ls()

  filelist <- unlist(lapply(object_names,FUN=file_size_overview))
  files <- data.frame(filesize=filelist,rank=rank(filelist),filenames=object_names)

  # Object names sorted from largest to smallest footprint.
  ordered_filelist <- object_names[order(filelist,decreasing=TRUE)]

    # English vocabulary list, plus US first names (babynames) and surnames
    # (wru 2010 census list), all lower-cased, for filtering OCR noise.
    endict <- read.csv(file="ENGVOC_lo.txt")
    endict <- as.character(endict[,1]) # ensure that the vocabulary list is represented as character vector.
    person.names <- as.matrix(babynames::babynames[,3])
    person_names <- tolower(unique(person.names))
    surnames <- tolower(wru::surnames2010[,1])
    fullwordlist <- c(endict,person_names,surnames)

  # dictionary1 = vocabulary + names; dictionary2 = vocabulary only.
  dictionary1 <- fullwordlist
  dictionary2 <- endict

  # Keep only recognised English words in the noncrisis-keyness table.
  ncc_clean <- ncc_all[ncc_all$feature%in%dictionary2,]

  # For the crisis-keyness tables, keep features that also occur at least
  # once in the reference corpus (drops target-only OCR artefacts).
  cnc_clean <- cnc_all[cnc_all$n_reference>0,]
  cnc_1800_clean <- cnc_1800[cnc_1800$n_reference>0,]
  cnc_1900_clean <- cnc_1900[cnc_1900$n_reference>0,]
  cnc_1950_clean <- cnc_1950[cnc_1950$n_reference>0,]
  cnc_2000_clean <- cnc_2000[cnc_2000$n_reference>0,]


  # Seed crisis vocabulary; flag (0/1) whether each keyness feature is one
  # of these seed terms, overall and per period.
  crisis_keywords <- c("crisis", "recession", "disaster", "emergency", "collapse", "epidemic", "pandemic", "catastrophe", "breakdown", "debacle")

  cnc_clean$crisis_keyword <- 1*cnc_clean$feature%in%crisis_keywords
  cnc_1800_clean$crisis_keyword <- 1*cnc_1800_clean$feature%in%crisis_keywords
  cnc_1900_clean$crisis_keyword <- 1*cnc_1900_clean$feature%in%crisis_keywords
  cnc_1950_clean$crisis_keyword <- 1*cnc_1950_clean$feature%in%crisis_keywords
  cnc_2000_clean$crisis_keyword <- 1*cnc_2000_clean$feature%in%crisis_keywords

  # Plot attributes for the keyness word clouds: seed crisis keywords are set
  # in italics and get chi2NA = NA (so they take the grey na.value colour);
  # all other terms are bold and keep their chi2 value.
  cnc_clean$fontface <- ifelse(cnc_clean$crisis_keyword==1,"italic", "bold")
  cnc_clean$chi2NA <- ifelse(cnc_clean$crisis_keyword==0,cnc_clean$chi2,NA)
  # BUG FIX: the four period-specific chi2NA columns below previously pulled
  # chi2 from cnc_clean (copy-paste error), which misaligns values with the
  # period tables and silently recycles a vector of the wrong length.  Each
  # period now uses its own table's chi2 column.
  cnc_1800_clean$fontface <- ifelse(cnc_1800_clean$crisis_keyword==1,"italic", "bold")
  cnc_1800_clean$chi2NA <- ifelse(cnc_1800_clean$crisis_keyword==0,cnc_1800_clean$chi2,NA)
  cnc_1900_clean$fontface <- ifelse(cnc_1900_clean$crisis_keyword==1,"italic", "bold")
  cnc_1900_clean$chi2NA <- ifelse(cnc_1900_clean$crisis_keyword==0,cnc_1900_clean$chi2,NA)
  cnc_1950_clean$fontface <- ifelse(cnc_1950_clean$crisis_keyword==1,"italic", "bold")
  cnc_1950_clean$chi2NA <- ifelse(cnc_1950_clean$crisis_keyword==0,cnc_1950_clean$chi2,NA)
  cnc_2000_clean$fontface <- ifelse(cnc_2000_clean$crisis_keyword==1,"italic", "bold")
  cnc_2000_clean$chi2NA <- ifelse(cnc_2000_clean$crisis_keyword==0,cnc_2000_clean$chi2,NA)

  # Rank features within each period table (tables are assumed already
  # sorted by keyness, so row order = rank).
  cnc_1800_clean$rank <- 1:dim(cnc_1800_clean)[1]
  cnc_1900_clean$rank <- 1:dim(cnc_1900_clean)[1]
  cnc_1950_clean$rank <- 1:dim(cnc_1950_clean)[1]
  cnc_2000_clean$rank <- 1:dim(cnc_2000_clean)[1]

  # Union of each period's top-100 crisis-keyness features.
  crisis_features <- c(cnc_1800_clean$feature[1:100],cnc_1900_clean$feature[1:100],cnc_1950_clean$feature[1:100],cnc_2000_clean$feature[1:100])

  # Cross-period rank table: one row per unique feature, one rank column per
  # period (NA where the feature is absent in that period).
  cnc <- data.frame(feature=unique(crisis_features))
  cnc$r1800 <- cnc_1800_clean[match(cnc$feature,cnc_1800_clean$feature),"rank"]
  cnc$r1900 <- cnc_1900_clean[match(cnc$feature,cnc_1900_clean$feature),"rank"]
  cnc$r1950 <- cnc_1950_clean[match(cnc$feature,cnc_1950_clean$feature),"rank"]
  cnc$r2000 <- cnc_2000_clean[match(cnc$feature,cnc_2000_clean$feature),"rank"]

  # Censored ranks: anything outside the top 100 (or missing) becomes 101.
  cnc$r1800r <- replace(cnc$r1800,(cnc$r1800>100|is.na(cnc$r1800)),101)
  cnc$r1900r <- replace(cnc$r1900,(cnc$r1900>100|is.na(cnc$r1900)),101)
  cnc$r1950r <- replace(cnc$r1950,(cnc$r1950>100|is.na(cnc$r1950)),101)
  cnc$r2000r <- replace(cnc$r2000,(cnc$r2000>100|is.na(cnc$r2000)),101)

  # NOTE(review): this assigns the names the columns already have -- a no-op
  # kept for explicitness.
  names(cnc) <- c("feature", "r1800", "r1900", "r1950", "r2000", "r1800r", "r1900r", "r1950r", "r2000r")



  # Word-cloud style plots of the top-100 crisis-keyness terms: positions are
  # random uniforms (note runif(min=,max=,100): the 100 binds positionally to
  # n), text size ~ log target/reference ratio, colour ~ log chi2 (seed
  # keywords have chi2NA = NA and take the grey na.value), transparency ~ log
  # reference frequency.  NOTE(review): no set.seed() -- the layout differs
  # on every run.  theme_bluewhite/scale_color_viridis come from attached
  # theme/viridis packages.
  ggplot(cnc_clean[1:100,],aes(x=runif(min=0,max=1,100),y=runif(min=0,max=1,100),label=feature,size=10+log(n_target/n_reference),color=log(chi2NA),alpha=log(n_reference)))+geom_text()+scale_color_gradient(low="green",high="red",na.value="grey50")+theme_bluewhite()

  ggplot(cnc_clean[1:100,],aes(x=runif(min=0,max=1,100),y=runif(min=0,max=1,100),label=feature,size=10+log(n_target/n_reference),color=log(chi2NA),alpha=log(n_reference)))+geom_text()+scale_color_viridis(option="turbo",na.value="grey50")+theme_bluewhite()

  # Publication versions: fontface distinguishes seed keywords (italic) from
  # the rest (bold); one plot overall and one per period.
  crisis_significant_terms <- ggplot(cnc_clean[1:100,],aes(x=runif(min=0.1,max=0.9,100),y=runif(min=0.1,max=0.9,100),label=feature,size=10+log(n_target/n_reference),color=log(chi2NA),fontface=fontface))+geom_text()+scale_color_viridis(option="viridis",end=0.9,na.value="grey50")+theme_bluewhite()+ylab("")+xlab("")+theme(axis.ticks.x = element_blank(),axis.text.x = element_blank(),axis.ticks.y = element_blank(),axis.text.y = element_blank())+guides(color=guide_colorbar("Deviation"),alpha="none",size="none")+xlim(0,1)+ylim(0,1)

  crisis_significant_terms_2000 <- ggplot(cnc_2000_clean[1:100,],aes(x=runif(min=0,max=1,100),y=runif(min=0,max=1,100),label=feature,size=10+log(n_target/n_reference),color=log(chi2NA),fontface=fontface))+geom_text()+scale_color_viridis(option="viridis",end=0.9,na.value="grey50")+theme_bluewhite()+ylab("")+xlab("")+theme(axis.ticks.x = element_blank(),axis.text.x = element_blank(),axis.ticks.y = element_blank(),axis.text.y = element_blank())+guides(color=guide_colorbar("Deviation"),alpha="none",size="none")+xlim(0,1)+ylim(0,1)

  crisis_significant_terms_1950 <- ggplot(cnc_1950_clean[1:100,],aes(x=runif(min=0,max=1,100),y=runif(min=0,max=1,100),label=feature,size=10+log(n_target/n_reference),color=log(chi2NA),fontface=fontface))+geom_text()+scale_color_viridis(option="viridis",end=0.9,na.value="grey50")+theme_bluewhite()+ylab("")+xlab("")+theme(axis.ticks.x = element_blank(),axis.text.x = element_blank(),axis.ticks.y = element_blank(),axis.text.y = element_blank())+guides(color=guide_colorbar("Deviation"),alpha="none",size="none")+xlim(0,1)+ylim(0,1)

  crisis_significant_terms_1900 <- ggplot(cnc_1900_clean[1:100,],aes(x=runif(min=0,max=1,100),y=runif(min=0,max=1,100),label=feature,size=10+log(n_target/n_reference),color=log(chi2NA),fontface=fontface))+geom_text()+scale_color_viridis(option="viridis",end=0.9,na.value="grey50")+theme_bluewhite()+ylab("")+xlab("")+theme(axis.ticks.x = element_blank(),axis.text.x = element_blank(),axis.ticks.y = element_blank(),axis.text.y = element_blank())+guides(color=guide_colorbar("Deviation"),alpha="none",size="none")+xlim(0,1)+ylim(0,1)

  crisis_significant_terms_1800 <- ggplot(cnc_1800_clean[1:100,],aes(x=runif(min=0,max=1,100),y=runif(min=0,max=1,100),label=feature,size=10+log(n_target/n_reference),color=log(chi2NA),fontface=fontface))+geom_text()+scale_color_viridis(option="viridis",end=0.9,na.value="grey50")+theme_bluewhite()+ylab("")+xlab("")+theme(axis.ticks.x = element_blank(),axis.text.x = element_blank(),axis.ticks.y = element_blank(),axis.text.y = element_blank())+guides(color=guide_colorbar("Deviation"),alpha="none",size="none")+xlim(0,1)+ylim(0,1)


  # Counterpart cloud for the non-crisis corpus (plain chi2 colouring).
  noncrisis_significant_terms <- ggplot(ncc_clean[1:100,],aes(x=runif(min=0,max=1,100),y=runif(min=0,max=1,100),label=feature,size=10+log(n_target/n_reference),color=log(chi2)))+geom_text()+scale_color_viridis(option="viridis",end=0.9,na.value="grey50")+theme_bluewhite()

#  ggplot(ncc_clean[1:300,],aes(x=runif(min=0,max=1,300),y=runif(min=0,max=1,300),label=feature,size=10+log(n_target),color=log(chi2)))+geom_text()+scale_color_viridis(option="viridis",end=0.9,na.value="grey50")+theme_bluewhite()



#  ggplot(cnc_all[1:100,],aes(x=runif(min=0,max=1,100),y=runif(min=0,max=1,100),label=feature,size=10+log(n_target/n_reference),color=log(chi2),alpha=log(n_reference)))+geom_text()+scale_color_gradient(low="green",high="red")+theme(text=element_text(size=20))

#  ggplot(cnc_all[1:300,],aes(x=runif(min=0,max=1,300),y=runif(min=0,max=1,300),label=feature,size=10+log(n_target/n_reference),color=log(chi2),alpha=log(n_reference)))+geom_text()+scale_color_gradient(low="green",high="red")+theme_bluewhite()

#  ggplot(cnc_all[1:300,],aes(x=runif(min=0,max=1,300),y=runif(min=0,max=1,300),label=feature,size=10+log(n_target/n_reference),color=log(chi2),alpha=log(n_reference)))+geom_text()+scale_color_viridis(option="turbo")+theme_bluewhite()

  # Row id for each feature in the crisis-keyness table.
  cnc$id <- 1:dim(cnc)[1]

  # Rank-trajectory table: one row per feature with its capped rank per era;
  # 101 codes "not in the top 100 for that era" (set above).
  cncr <- data.frame(id=cnc$id,feature=cnc$feature,r1800r=as.numeric(unlist(cnc$r1800r)),r1900r=as.numeric(unlist(cnc$r1900r)),r1950r=as.numeric(unlist(cnc$r1950r)),r2000r=as.numeric(unlist(cnc$r2000r)))

# as.matrix(cncr$feature)

  # rem.X: feature fell out of the top 100 in era X after being in the
  # top 100 in the immediately preceding era.
  cncr$rem.2000 <- (cncr$r2000r==101 & cncr$r1950r<101)
  cncr$rem.1950 <- (cncr$r1950r==101 & cncr$r1900r<101)
  cncr$rem.1900 <- (cncr$r1900r==101 & cncr$r1800r<101)

  # new.X: era X is the first era in which the feature enters the top 100.
  cncr$new.1800 <- (cncr$r1800r<101)
  cncr$new.1900 <- (cncr$r1900r<101) & (cncr$r1800r>100)
  cncr$new.1950 <- (cncr$r1950r<101) & (cncr$r1800r>100) & (cncr$r1900r>100)
  cncr$new.2000 <- (cncr$r2000r<101) & (cncr$r1800r>100) & (cncr$r1900r>100) & (cncr$r1950r>100) 

  # Era label of the latest removal / the first appearance.
  cncr$removed <- ifelse(cncr$rem.2000==TRUE,"2000-20",ifelse(cncr$rem.1950==TRUE,"1950-99",ifelse(cncr$rem.1900==TRUE,"1900-49", "NEVER")))
  cncr$new <- ifelse(cncr$new.2000==TRUE,"2000-20",ifelse(cncr$new.1950==TRUE,"1950-99",ifelse(cncr$new.1900,"1900-49", "1800-99")))
  # Recurrence: removed in one era, back in the top 100 in a later one.
  # NOTE(review): these comparisons use <100 / >100 where the flags above use
  # <101 / >100, so a feature ranked exactly 100 counts as absent here —
  # confirm the off-by-one is intended.
  cncr$recurrence <- ifelse(cncr$rem.1900 & (cncr$r1950r<100),"rem.1900_rec.1950",
              ifelse(cncr$rem.1900 & (cncr$r1950r>100 & cncr$r2000r<100),"rem.1900_rec.2000",
                ifelse(cncr$rem.1950 & (cncr$r2000r<100),"rem.1950_rec.2000", "no recurrence")))

  # Presence pattern across the four eras, e.g. "XOOX" = in the top 100 in
  # the 1800s and 2000s only.
  cncr$X <- paste0(ifelse(cncr$r1800r<101,"X", "O"),ifelse(cncr$r1900r<101,"X", "O"),ifelse(cncr$r1950r<101,"X", "O"),ifelse(cncr$r2000r<101,"X", "O"))

  # Long format (reshape2::melt): one row per feature x era-rank variable,
  # for trajectory plots.
  cnc.long <- melt(cncr,id.vars=c("feature"),measure.vars=c("r1800r", "r1900r", "r1950r", "r2000r"))

  # Carry the wide rank columns alongside the long rows (melt stacks the
  # full feature vector four times, one block per measure variable).
  cnc.long$r1800r <- rep(cncr$r1800r,times=4)
  cnc.long$r1900r <- rep(cncr$r1900r,times=4)
  cnc.long$r1950r <- rep(cncr$r1950r,times=4)
  cnc.long$r2000r <- rep(cncr$r2000r,times=4)

  # Best (lowest) rank the feature ever achieved (matrixStats::rowMins).
  cnc.long$best.rank <- rowMins(as.matrix(cbind(cnc.long[,c("r1800r", "r1900r", "r1950r", "r2000r")])),na.rm=TRUE)

#  ggplot(subset(cnc.long,best.rank<21),aes(x=variable,y=replace(value,value>20,21),group=feature,label=feature,color=feature))+geom_line()+geom_point()+geom_text()

  # Number the features 101..(n+100) and prefix each label with its number.
  # BUG FIX: `:` binds tighter than `+`, so the original
  # `101:dim(cnc_clean)[1]+100` evaluated as `(101:n) + 100` — a sequence of
  # length n-100 starting at 201 that paste() silently recycled against the
  # n features. Parenthesising gives the intended length-n sequence.
  cnc_clean$feature2 <- paste(101:(nrow(cnc_clean) + 100), cnc_clean$feature)
  cnc_clean$id <- seq_len(nrow(cnc_clean))

  # Matching row ids for the non-crisis keyness table.
  ncc_clean$id <- seq_len(nrow(ncc_clean))

# ggplot(cnc_clean[1:50,],aes(y=chi2,x=51-id,label=feature))+geom_col()+geom_text(hjust=0)+theme_bluewhite()+coord_flip()+ylim(0,150000)

# ggplot(ncc_clean[1:50,],aes(y=chi2,x=51-id,label=feature))+geom_col()+geom_text(hjust=0)+theme_bluewhite()+coord_flip()+ylim(-150000,0)

  # Restrict the overall keyness table to the curated keyword list.
  cnc_clean2 <- subset(cnc_clean, feature %in% dictionary2)

  # For each era: filter to dictionary2, label the era, and express the
  # target/reference counts per mille so eras of different corpus sizes are
  # comparable. Replaces four copy-pasted blocks; produces the same four
  # globals (cnc_clean2_1800 ... cnc_clean2_2000) with identical columns.
  era_labels <- c("1800" = "1800-1899", "1900" = "1900-1949",
                  "1950" = "1950-1999", "2000" = "2000-2020")
  for (tag in names(era_labels)) {
    d <- subset(get(paste0("cnc_", tag, "_clean")), feature %in% dictionary2)
    d$era <- era_labels[[tag]]
    # Per-mille share of the keyword within the filtered crisis corpus ...
    d$target1000 <- 1000 * d$n_target / sum(d$n_target)
    # ... and within the filtered reference corpus.
    d$reference1000 <- 1000 * d$n_reference / sum(d$n_reference)
    d$increase <- d$target1000 - d$reference1000  # absolute per-mille gain
    d$ratio <- d$target1000 / d$reference1000     # relative salience
    assign(paste0("cnc_clean2_", tag), d)
  }

  # Stack the four era tables into one long table.
  cnc_rep <- rbind(cnc_clean2_1800, cnc_clean2_1900, cnc_clean2_1950, cnc_clean2_2000)

  # Keep only features that made a top-100 list in some era, plus the
  # top-20 subsets of the first and last eras.
  cnc_top <- subset(cnc_rep, feature %in% cnc.long$feature)
  cnc_top1800 <- subset(cnc_rep, feature %in% cnc_clean2_1800$feature[1:20])
  cnc_top2000 <- subset(cnc_rep, feature %in% cnc_clean2_2000$feature[1:20])

  # The 24 hand-coded semantic categories of the crisis-keyword codebook.
  categories <- c("state", "executive", "emo_pos", "emo_neg", "finance",
                  "external", "economy", "business", "civic", "weness",
                  "threat", "temporal", "people", "information", "action",
                  "negotiation", "judgment", "politics", "up", "down",
                  "development", "capability", "system", "value")

  # Attach each category indicator by matching feature -> word in the
  # codebook (replaces 24 copy-pasted match() lines).
  for (cat in categories) {
    cnc_top[[cat]] <- df_ckw[match(cnc_top$feature, df_ckw$word), cat]
  }

  # Manual codings for words missing from (or miscoded in) df_ckw. Each
  # entry lists the categories coded TRUE for that word; every other
  # category is set FALSE. rep(..., each = 4) fills the word's four era
  # rows column-by-column, exactly as the original hard-coded T/F vectors
  # did (assumes each word appears in all four eras).
  manual_codes <- list(
    government = c("state", "executive", "politics"),
    affair     = "threat",
    world      = "up",
    year       = "temporal",
    recognise  = c("information", "judgment", "development"),
    president  = c("state", "executive", "politics"),
    opinion    = "judgment",
    nothing    = character(0),
    loss       = c("threat", "down"),
    labour     = c("economy", "business", "people"),
    feel       = c("emo_pos", "emo_neg", "judgment"),
    among      = character(0),
    army       = c("state", "executive", "threat")
  )
  for (w in names(manual_codes)) {
    cnc_top[cnc_top$feature == w, categories] <-
      rep(categories %in% manual_codes[[w]], each = 4)
  }

  # Per-category sums of keyword salience ratios, split by category
  # membership (Group.1 = TRUE/FALSE) and era (Group.2). Replaces 24
  # copy-pasted aggregate() calls; data.frame() recycles `category` to the
  # actual row count, which is also robust if a membership/era cell is
  # empty (the original rep(..., times = 8) assumed exactly 8 rows).
  df_ckw_ratio <- do.call(rbind, lapply(categories, function(cat) {
    data.frame(category = cat,
               aggregate(cnc_top$ratio,
                         by = list(cnc_top[[cat]], cnc_top$era), sum))
  }))


  # Per-category sums of the per-mille increase (crisis vs reference), by
  # category membership (Group.1) and era (Group.2). Same structure as
  # df_ckw_ratio; replaces 24 copy-pasted aggregate() calls.
  df_ckw_change <- do.call(rbind, lapply(categories, function(cat) {
    data.frame(category = cat,
               aggregate(cnc_top$increase,
                         by = list(cnc_top[[cat]], cnc_top$era), sum))
  }))


  # Per-category sums of the reference-corpus per-mille frequency
  # (the non-crisis baseline), by category membership (Group.1) and era
  # (Group.2). Same structure as df_ckw_ratio; replaces 24 copy-pasted
  # aggregate() calls.
  df_ckw_baseline <- do.call(rbind, lapply(categories, function(cat) {
    data.frame(category = cat,
               aggregate(cnc_top$reference1000,
                         by = list(cnc_top[[cat]], cnc_top$era), sum))
  }))


  # Wide version of df_ckw_change: one row per category x era, with columns
  # "TRUE"/"FALSE" holding the summed increase among members / non-members.
  df_ckw_change2 <- pivot_wider(df_ckw_change,names_from=Group.1,values_from=x) 
  # "share": ratio of member increase to non-member increase.
  # NOTE(review): despite the name this is a TRUE/FALSE ratio, not a share
  # of a total — confirm it matches the y-axis label of the plot below.
  df_ckw_change2$share <- (df_ckw_change2$"TRUE"/df_ckw_change2$"FALSE")
  df_ckw_change2$total <- df_ckw_change2$"TRUE"
  # Attach the baseline and ratio sums computed above.
  # NOTE(review): this assumes df_ckw_baseline and df_ckw_ratio have exactly
  # the same row order as df_ckw_change — true when every category has the
  # same membership/era cells; verify if any cell can be empty.
  df_ckw_change$baseline <- df_ckw_baseline$x
  df_ckw_change$ratio <- df_ckw_ratio$x
  df_ckw_change2$baseline <- subset(df_ckw_change,Group.1==TRUE)$baseline
  df_ckw_change2$ratio <- subset(df_ckw_change,Group.1==TRUE)$ratio
  # NOTE(review): this immediately overwrites the ratio column assigned on
  # the previous line with total/baseline, making that assignment dead code
  # — confirm which definition of "ratio" is intended.
  df_ckw_change2$ratio <- df_ckw_change2$total/(df_ckw_change2$baseline)

#  ggplot(subset(df_ckw_change,Group.1==TRUE),aes(y=x,x=Group.2,group=category,label=category,color=category))+geom_point()+geom_text()+geom_line()+facet_wrap(.~category)+theme_bluewhite()

#  ggplot(df_ckw_change2,aes(y=100*share,x=Group.2,group=category,label=category,color=category))+geom_point()+geom_text()+geom_line()+facet_wrap(.~category)+theme_bluewhite()

  # Faceted trend of each category's share of the total increase, by era;
  # point size shows the non-crisis baseline level.
  shares_change_by_era <- ggplot(df_ckw_change2,aes(y=100*share,x=Group.2,group=category,label=category,color=category))+geom_point(aes(size=baseline))+geom_line()+geom_hline(yintercept=0,color="red")+facet_wrap(.~category,scales="free_y")+theme_bluewhite()+theme(legend.position="none")+xlab("Era")+ylab("Share category in  total increase in crisis keyword salience")

# shares_change_by_era shows on y: The (sum of increase (in per mille) in frequency) of (all the term falling into the category) in (crisis vis-a-vis non-crisis periods), divided by the total sum of increase across all 226 keywords. The share of total increase.


  # Absolute per-mille increase per category and era, annotated with the
  # baseline (below zero line) and the increase itself.
  total_change_by_era <- ggplot(df_ckw_change2,aes(y=total,x=Group.2,group=category,color=category))+geom_point(aes(size=ratio))+geom_line()+geom_text(aes(y=(-1),label=round(baseline,1)))+geom_text(aes(y=total+3,label=paste0("+",round(total,1))))+geom_hline(yintercept=0,color="red")+facet_wrap(.~category)+theme_bluewhite()+theme(legend.position="none")+xlab("Era")+ylab("Per mille increase in salience crisis keywords belongig to the category")

  # NOTE(review): byte-identical to total_change_by_era despite the "fyear"
  # name — presumably a per-year version was intended; confirm.
  total_change_by_fyear <- ggplot(df_ckw_change2,aes(y=total,x=Group.2,group=category,color=category))+geom_point(aes(size=ratio))+geom_line()+geom_text(aes(y=(-1),label=round(baseline,1)))+geom_text(aes(y=total+3,label=paste0("+",round(total,1))))+geom_hline(yintercept=0,color="red")+facet_wrap(.~category)+theme_bluewhite()+theme(legend.position="none")+xlab("Era")+ylab("Per mille increase in salience crisis keywords belongig to the category")


  # Patch four codebook entries whose word field was wrong or missing,
  # keyed on the codebook item number.
  word_fixes <- data.frame(items = c(76, 77, 78, 79),
                           word = c("feel", "recognise", "nothing", "opinion"),
                           stringsAsFactors = FALSE)
  for (k in seq_len(nrow(word_fixes))) {
    df_ckw[df_ckw$items == word_fixes$items[k], "word"] <- word_fixes$word[k]
  }


  # Build the quanteda dictionary: one entry per semantic category,
  # containing the codebook words flagged 1 for that category. Replaces
  # 24 repeated list entries; `categories` is defined above in codebook
  # order, so the entry order is unchanged.
  dictionary_ckw <- dictionary(setNames(
    lapply(categories, function(cat) df_ckw$word[df_ckw[[cat]] == 1]),
    categories))

  # Count category hits per document; tokens matching no category go to
  # the "other" key.
  ckw_freq <- dfm_lookup(t.dtm, dictionary = dictionary_ckw, nomatch = "other")
  ckw_freq_mat <- as.matrix(ckw_freq)

  # Document-level table: year, crisis/non-crisis corpus flag, and the
  # 24 category counts plus "other".
  df.cnc.terms <- data.frame(year = t.dtm@docvars$year,
                             crisis = t.dtm@docvars$Corpus,
                             ckw_freq_mat)
  # Total counted tokens per document (columns 3:27 = 24 categories + "other").
  df.cnc.terms$total <- rowSums(df.cnc.terms[, 3:27], na.rm = TRUE)

  # Per-mille rate of each category within the document's counted tokens
  # (replaces 24 copy-pasted lines; column order follows `categories`,
  # matching the original state.r ... value.r order).
  for (cat in categories) {
    df.cnc.terms[[paste0(cat, ".r")]] <-
      1000 * df.cnc.terms[[cat]] / df.cnc.terms$total
  }

  # Outcome and time covariates.
  df.cnc.terms$crisis_d <- as.numeric(df.cnc.terms$crisis == "Crisis")  # 1 = crisis corpus
  df.cnc.terms$cyear <- as.numeric(df.cnc.terms$year) - 1785            # years since 1785
  df.cnc.terms$era <- ifelse(as.numeric(df.cnc.terms$year) < 1900, "1785-1899",
              ifelse(as.numeric(df.cnc.terms$year) < 1950, "1900-1949",
                ifelse(as.numeric(df.cnc.terms$year) < 2000, "1950-1999", "2000-2020")))
  df.cnc.terms$ccent <- df.cnc.terms$cyear / 100                 # centuries since 1785
  df.cnc.terms$ccentsq <- df.cnc.terms$ccent * df.cnc.terms$ccent

  # Year-level base rate of crisis documents, merged back as a "prior"
  # covariate for the models below.
  prior <- data.frame(prior = tapply(df.cnc.terms$crisis_d, df.cnc.terms$cyear,
                                     mean, na.rm = TRUE))
  prior$cyear <- rownames(prior)
  df.cnc.terms$prior <- prior[match(df.cnc.terms$cyear, prior$cyear), "prior"]

  # Decade, plus factor versions of year and decade for the interaction
  # models below.
  df.cnc.terms$decade <- 10 * floor(as.numeric(df.cnc.terms$year) / 10)
  df.cnc.terms$fyear <- factor(df.cnc.terms$year)
  df.cnc.terms$fdecade <- factor(df.cnc.terms$decade)

  # Logistic classifier: does the mix of the 24 category rates predict
  # whether a document is from the crisis corpus, with a linear-time
  # (cyear) interaction on every rate?
  glm.crisis <- glm(crisis_d~cyear*(state.r+executive.r+emo_pos.r+emo_neg.r+finance.r+external.r+economy.r+business.r+civic.r+weness.r+threat.r+temporal.r+people.r+information.r+action.r+negotiation.r+judgment.r+politics.r+up.r+down.r+development.r+capability.r+system.r+value.r),data=df.cnc.terms,family=binomial(link="logit"))

  # Quadratic-time version (poly(ccent, 2)) with the year-level crisis base
  # rate ("prior") as a covariate; fitted only on rows with a defined ccent.
  glm.crisis3 <- glm(crisis_d~prior+poly(ccent,2)*(state.r+executive.r+emo_pos.r+emo_neg.r+finance.r+external.r+economy.r+business.r+civic.r+weness.r+threat.r+temporal.r+people.r+information.r+action.r+negotiation.r+judgment.r+politics.r+up.r+down.r+development.r+capability.r+system.r+value.r),data=subset(df.cnc.terms,!is.na(ccent)),family=binomial(link="logit"))

  # Effect surfaces from the quadratic model for selected categories:
  # time grid ccent 0-2.35 (= 1785-2020) x rate grid 0-50 per mille.
  df.state.r <- data.frame(Effect(glm.crisis3,focal.predictors=c("ccent", "state.r"),xlevels=list(ccent=seq(0,2.35,0.01),state.r=seq(0,50,1))))

  df.threat.r <- data.frame(Effect(glm.crisis3,focal.predictors=c("ccent", "threat.r"),xlevels=list(ccent=seq(0,2.35,0.01),threat.r=seq(0,50,1))))

  df.executive.r <- data.frame(Effect(glm.crisis3,focal.predictors=c("ccent", "executive.r"),xlevels=list(ccent=seq(0,2.35,0.01),executive.r=seq(0,50,1))))

  df.economy.r <- data.frame(Effect(glm.crisis3,focal.predictors=c("ccent", "economy.r"),xlevels=list(ccent=seq(0,2.35,0.01),economy.r=seq(0,50,1))))

  df.finance.r <- data.frame(Effect(glm.crisis3,focal.predictors=c("ccent", "finance.r"),xlevels=list(ccent=seq(0,2.35,0.01),finance.r=seq(0,50,1))))

  # Decade-factor version of the interaction model (used by the decade
  # effect grids below).
  glm.crisis_dec <- glm(crisis_d~prior+fdecade*(state.r+executive.r+emo_pos.r+emo_neg.r+finance.r+external.r+economy.r+business.r+civic.r+weness.r+threat.r+temporal.r+people.r+information.r+action.r+negotiation.r+judgment.r+politics.r+up.r+down.r+development.r+capability.r+system.r+value.r),data=subset(df.cnc.terms,!is.na(ccent)),family=binomial(link="logit"))

  # Decade-specific effect surfaces for selected categories.
  # BUG FIX: these Effect() calls referenced glm.crisis3, which contains
  # poly(ccent, 2) but no fdecade term, so "fdecade" is not a valid focal
  # predictor there; the decade-interaction model fitted just above is
  # glm.crisis_dec, and that is the model used here. Stray ccent entries
  # in xlevels (irrelevant for the factor fdecade) have been dropped.
  # NOTE(review): these assignments overwrite the ccent-based df.*.r frames
  # created above — confirm that is intended before re-enabling the
  # commented plots below that expect a ccent column.
  df.state.r <- data.frame(Effect(glm.crisis_dec, focal.predictors = c("fdecade", "state.r"),
                                  xlevels = list(state.r = seq(0, 50, 1))))

  df.threat.r <- data.frame(Effect(glm.crisis_dec, focal.predictors = c("fdecade", "threat.r"),
                                   xlevels = list(threat.r = seq(0, 50, 1))))

  df.executive.r <- data.frame(Effect(glm.crisis_dec, focal.predictors = c("fdecade", "executive.r"),
                                      xlevels = list(executive.r = seq(0, 50, 1))))

  df.economy.r <- data.frame(Effect(glm.crisis_dec, focal.predictors = c("fdecade", "economy.r"),
                                    xlevels = list(economy.r = seq(0, 50, 1))))

  df.finance.r <- data.frame(Effect(glm.crisis_dec, focal.predictors = c("fdecade", "finance.r"),
                                    xlevels = list(finance.r = seq(0, 50, 1))))

  # ggplot(df.state.r,aes(x=ccent*100+1785,y=state.r,z=fit))+geom_contour_filled()
  # ggplot(df.threat.r,aes(x=ccent*100+1785,y=threat.r,z=fit))+geom_contour_filled()
  # ggplot(df.executive.r,aes(x=ccent*100+1785,y=executive.r,z=fit))+geom_contour_filled()
  # ggplot(df.economy.r,aes(x=ccent*100+1785,y=economy.r,z=fit))+geom_contour_filled()
  # ggplot(df.finance.r,aes(x=ccent*100+1785,y=finance.r,z=fit))+geom_contour_filled()

  # ggplot(df.state.r,aes(x=ccent*100+1785,y=state.r,z=fit))+geom_contour_filled()
  # ggplot(df.threat.r,aes(x=ccent*100+1785,y=threat.r,z=fit))+geom_contour_filled()
  # ggplot(df.executive.r,aes(x=ccent*100+1785,y=executive.r,z=fit))+geom_contour_filled()
  # ggplot(df.economy.r,aes(x=ccent*100+1785,y=economy.r,z=fit))+geom_contour_filled()
  # ggplot(df.finance.r,aes(x=ccent*100+1785,y=finance.r,z=fit))+geom_contour_filled()


  # Era- and decade-interaction models with all 24 category rates at once.
  glm.crisis.era <- glm(crisis_d~era*(state.r+executive.r+emo_pos.r+emo_neg.r+finance.r+external.r+economy.r+business.r+civic.r+weness.r+threat.r+temporal.r+people.r+information.r+action.r+negotiation.r+judgment.r+politics.r+up.r+down.r+development.r+capability.r+system.r+value.r),data=df.cnc.terms,family=binomial(link="logit"))

  glm.crisis.fdecade <- glm(crisis_d~fdecade*(state.r+executive.r+emo_pos.r+emo_neg.r+finance.r+external.r+economy.r+business.r+civic.r+weness.r+threat.r+temporal.r+people.r+information.r+action.r+negotiation.r+judgment.r+politics.r+up.r+down.r+development.r+capability.r+system.r+value.r),data=df.cnc.terms,family=binomial(link="logit"))

  # One single-category model per keyword category: the category's per-mille
  # rate interacted with decade, controlling for the year-level prior.
  # Replaces 24 copy-pasted glm() calls; each fit is assigned to the same
  # global name (glm.crisis.<category>) that the coef_extr() calls below
  # expect. Fitting order differs from the original (codebook order instead
  # of alphabetical), which does not affect any individual fit.
  for (cat in categories) {
    assign(paste0("glm.crisis.", cat),
           glm(as.formula(paste0("crisis_d~prior+fdecade*(", cat, ".r)")),
               data = df.cnc.terms, family = binomial(link = "logit")))
  }

  # One row per decade (1780-2020, 25 rows): the decade-specific coefficient
  # of each category's rate term, pulled from the per-category models above.
  # NOTE(review): coef_extr() is defined elsewhere in the script; presumably
  # it combines term y's main effect with its fdecade interactions from model
  # x and returns one value per decade -- confirm it returns 25 values to
  # match decade=seq(1780,2020,10).
  df_coef2 <- data.frame(
      decade=seq(1780,2020,10),
      action=coef_extr(x=glm.crisis.action,y="action.r"),
      business=coef_extr(x=glm.crisis.business,y="business.r"),
      capability=coef_extr(x=glm.crisis.capability,y="capability.r"),
      civic=coef_extr(x=glm.crisis.civic,y="civic.r"),
      development=coef_extr(x=glm.crisis.development,y="development.r"),
      down=coef_extr(x=glm.crisis.down,y="down.r"),
      economy=coef_extr(x=glm.crisis.economy,y="economy.r"),
      emo_neg=coef_extr(x=glm.crisis.emo_neg,y="emo_neg.r"),
      emo_pos=coef_extr(x=glm.crisis.emo_pos,y="emo_pos.r"),
      executive=coef_extr(x=glm.crisis.executive,y="executive.r"),
      external=coef_extr(x=glm.crisis.external,y="external.r"),
      finance=coef_extr(x=glm.crisis.finance,y="finance.r"),
      information=coef_extr(x=glm.crisis.information,y="information.r"),
      judgment=coef_extr(x=glm.crisis.judgment,y="judgment.r"),
      negotiation=coef_extr(x=glm.crisis.negotiation,y="negotiation.r"),
      people=coef_extr(x=glm.crisis.people,y="people.r"),
      politics=coef_extr(x=glm.crisis.politics,y="politics.r"),
      state=coef_extr(x=glm.crisis.state,y="state.r"),
      system=coef_extr(x=glm.crisis.system,y="system.r"),
      temporal=coef_extr(x=glm.crisis.temporal,y="temporal.r"),
      threat=coef_extr(x=glm.crisis.threat,y="threat.r"),
      up=coef_extr(x=glm.crisis.up,y="up.r"),
      value=coef_extr(x=glm.crisis.value,y="value.r"),
      weness=coef_extr(x=glm.crisis.weness,y="weness.r"))

  # Category labels without the ".r" suffix; must match the column names of
  # df_coef2 built above.
  domains <- c("action", "business", "capability", "civic", "development", "down", "economy", "emo_neg", "emo_pos", "executive", "external", "finance", "information", "judgment", "negotiation", "people", "politics", "state", "system", "temporal", "threat", "up", "value", "weness")

  # Long format for faceted plotting.  all_of() makes the external-vector
  # column selection explicit (passing a bare character vector to `cols=` is
  # deprecated in tidyselect and errors if a name is missing, which is what
  # we want here).
  dfl_coef <- pivot_longer(df_coef2, cols = all_of(domains))

  # Smoothed per-decade coefficient trajectories, one facet per category.
  # The incomplete 2020 decade is excluded.
  # (This object is rebuilt further below from the tcoef2 data; this first
  # version uses the coef_extr-based df_coef2 values.)
  crisis_vocab <- ggplot(subset(dfl_coef,!(decade==2020)),aes(y=value,x=decade))+geom_point(size=3.5,shape=16,fill="white",color="#20a387")+geom_smooth(span=0.5,color="#20a387",fill="#20a387")+geom_point(size=1.5,shape=16,fill="white",color="white")+geom_hline(yintercept=0,color="#D55E00",linetype="dashed")+facet_wrap(.~name,scale="free_y")+theme_soft()

  # ggsave(crisis_vocab,file="crisis_vocab.svg",device="svg",unit="cm",height=10,width=20,dpi=1200,scale=2.0)

  # ggsave(crisis_vocab,file="crisis_vocab.png",device="png",unit="cm",height=12,width=20,dpi=1200,scale=1.75)

  # Effect frames from the joint decade model, one per category: each rate
  # term yields a df.<category> data frame (df.state, df.executive, ...).
  # Equivalent to writing the 24 Effect() calls out by hand.
  # NOTE(review): these frames are immediately recomputed from the
  # per-category models in the step below, so this pass appears to be kept
  # only for comparison.
  for (nm in c("state", "executive", "politics", "capability", "system",
               "civic", "up", "down", "value", "emo_pos", "emo_neg",
               "judgment", "negotiation", "information", "action",
               "development", "temporal", "finance", "economy", "business",
               "external", "threat", "weness", "people")) {
    assign(paste0("df.", nm),
           data.frame(Effect(glm.crisis.fdecade,
                             focal.predictors = c("fdecade", paste0(nm, ".r")))))
  }

  # Overwrite each category's effect frame with the one from its own
  # prior-controlled per-decade model (glm.crisis.<category>, fitted above).
  # Equivalent to the 24 explicit Effect() calls, looked up by name.
  for (nm in c("state", "executive", "politics", "capability", "system",
               "civic", "up", "down", "value", "emo_pos", "emo_neg",
               "judgment", "negotiation", "information", "action",
               "development", "temporal", "finance", "economy", "business",
               "external", "threat", "weness", "people")) {
    assign(paste0("df.", nm),
           data.frame(Effect(get(paste0("glm.crisis.", nm)),
                             focal.predictors = c("fdecade", paste0(nm, ".r")))))
  }

  # Stack the 24 per-category effect frames row-wise.  mapply(c, ...)
  # concatenates the frames column-by-column, which coerces factor columns
  # (e.g. fdecade) to their underlying codes -- hence `decade` is restored
  # from df.state$fdecade below.
  mat.glm <- mapply(c,df.state,df.executive,df.politics,df.capability,df.system,df.civic,df.up,df.down,df.value,df.emo_pos,df.emo_neg,df.judgment,df.negotiation,df.information,df.action, df.development,df.temporal,df.finance,df.economy,df.business,df.external,df.threat,df.weness,df.people)
  df.glm <- data.frame(mat.glm)
  names(df.glm) <- c("decade", "rfreq", "fit", "se", "lower", "upper")
  df.glm$decade <- rep(df.state$fdecade,times=24)

  # 125 rows per category: presumably 25 decades x 5 default predictor
  # levels per Effect() frame -- TODO confirm against nrow(df.state); the
  # label order here must match the frame order passed to mapply() above.
  df.glm$category <- rep(c( "state", "executive", "politics", "capability", "system", "civic", "up", "down", "value", "emo_pos",
              "emo_neg", "judgment", "negotiation", "information", "action", "development", "temporal", "finance", "economy", "business",
              "external", "threat", "weness", "people"),each=125)
                        
  # Coefficient table of the decade-interaction model.
  # NOTE(review): glm.crisis_dec is not defined in this section -- presumably
  # fitted earlier in the script (possibly the same specification as
  # glm.crisis.fdecade); confirm before re-running top-to-bottom.
  df.coef <- data.frame(coef=coef(glm.crisis_dec))
  df.coef$term <- rownames(df.coef)

  # Canonical ordering of the 24 rate terms as they appear in the model.
  rate_terms <- c(  "state.r", "executive.r", "politics.r", "capability.r", "system.r", "civic.r", "up.r", "down.r", "value.r", "emo_pos.r",
              "emo_neg.r", "judgment.r", "negotiation.r", "information.r", "action.r", "development.r", "temporal.r", "finance.r", "economy.r", "business.r",
              "external.r", "threat.r", "weness.r", "people.r")

  # Locate coefficient groups by name pattern (decade main effects, decade x
  # rate interactions, both) ...
  main_decades_loc <- str_extract(df.coef$term,"fdecade[[:digit:]]{4,4}$")
  inter_decades_loc <- str_extract(df.coef$term,"fdecade[[:digit:]]{4,4}.*r$")
  all_decades_loc <- str_extract(df.coef$term,"fdecade[[:digit:]]{4,4}.*$")
  # ... and by hard-coded position: entries 27-50 are the 24 rate main
  # effects, 51-626 the 576 (24 decades x 24 rates) interactions; positions
  # 1-26 are the intercept and the leading non-rate terms.
  # NOTE(review): these indices are tied to the exact coefficient layout of
  # glm.crisis_dec -- fragile if the formula changes; confirm the layout.
  main_rates <- df.coef$term[c(rep(NA,times=26),27:50,rep(NA,times=576))]
  inter_rates <- df.coef$term[c(rep(NA,times=50),51:626)]
  all_rates <- df.coef$term[c(rep(NA,times=26),27:626)]

  # Per-decade total coefficient for each category: rate main effect + its
  # decade interaction + the decade main effect (one value per non-baseline
  # decade, hence seq(1790,2020,10) = 24 rows).
  # NOTE(review): this variant is immediately overwritten by the next tcoef
  # assignment, which drops the decade main effects; kept, presumably, for
  # comparison -- confirm which variant is intended.
  tcoef <- data.frame(decade=seq(1790,2020,10),
    state=df.coef$coef[which(main_rates=="state.r")]+df.coef$coef[which(str_detect(inter_rates,"state.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    executive=df.coef$coef[which(main_rates=="executive.r")]+df.coef$coef[which(str_detect(inter_rates,"executive.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    politics=df.coef$coef[which(main_rates=="politics.r")]+df.coef$coef[which(str_detect(inter_rates,"politics.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    capability=df.coef$coef[which(main_rates=="capability.r")]+df.coef$coef[which(str_detect(inter_rates,"capability.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    system=df.coef$coef[which(main_rates=="system.r")]+df.coef$coef[which(str_detect(inter_rates,"system.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    civic=df.coef$coef[which(main_rates=="civic.r")]+df.coef$coef[which(str_detect(inter_rates,"civic.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    up=df.coef$coef[which(main_rates=="up.r")]+df.coef$coef[which(str_detect(inter_rates,"up.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    down=df.coef$coef[which(main_rates=="down.r")]+df.coef$coef[which(str_detect(inter_rates,"down.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    value=df.coef$coef[which(main_rates=="value.r")]+df.coef$coef[which(str_detect(inter_rates,"value.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    emo_pos=df.coef$coef[which(main_rates=="emo_pos.r")]+df.coef$coef[which(str_detect(inter_rates,"emo_pos.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    emo_neg=df.coef$coef[which(main_rates=="emo_neg.r")]+df.coef$coef[which(str_detect(inter_rates,"emo_neg.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    finance=df.coef$coef[which(main_rates=="finance.r")]+df.coef$coef[which(str_detect(inter_rates,"finance.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    external=df.coef$coef[which(main_rates=="external.r")]+df.coef$coef[which(str_detect(inter_rates,"external.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    economy=df.coef$coef[which(main_rates=="economy.r")]+df.coef$coef[which(str_detect(inter_rates,"economy.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    business=df.coef$coef[which(main_rates=="business.r")]+df.coef$coef[which(str_detect(inter_rates,"business.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    judgment=df.coef$coef[which(main_rates=="judgment.r")]+df.coef$coef[which(str_detect(inter_rates,"judgment.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    negotiation=df.coef$coef[which(main_rates=="negotiation.r")]+df.coef$coef[which(str_detect(inter_rates,"negotiation.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    information=df.coef$coef[which(main_rates=="information.r")]+df.coef$coef[which(str_detect(inter_rates,"information.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    action=df.coef$coef[which(main_rates=="action.r")]+df.coef$coef[which(str_detect(inter_rates,"action.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    development=df.coef$coef[which(main_rates=="development.r")]+df.coef$coef[which(str_detect(inter_rates,"development.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    temporal=df.coef$coef[which(main_rates=="temporal.r")]+df.coef$coef[which(str_detect(inter_rates,"temporal.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    threat=df.coef$coef[which(main_rates=="threat.r")]+df.coef$coef[which(str_detect(inter_rates,"threat.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    people=df.coef$coef[which(main_rates=="people.r")]+df.coef$coef[which(str_detect(inter_rates,"people.r"))]+df.coef$coef[!is.na(main_decades_loc)],
    weness=df.coef$coef[which(main_rates=="weness.r")]+df.coef$coef[which(str_detect(inter_rates,"weness.r"))]+df.coef$coef[!is.na(main_decades_loc)]
    )
    
  # Per-decade coefficient for each category: rate main effect plus its
  # decade interaction (one value per non-baseline decade, hence
  # seq(1790,2020,10) = 24 rows).  Unlike the variant above, the decade main
  # effects are not added.
  # Helper: look up term's main-effect coefficient (a scalar, recycled) and
  # its 24 decade-interaction coefficients.  fixed() forces literal matching;
  # the previous bare patterns treated the "." in e.g. "state.r" as a regex
  # wildcard.
  term_coef <- function(term) {
    df.coef$coef[which(main_rates == term)] +
      df.coef$coef[which(str_detect(inter_rates, fixed(term)))]
  }

  tcoef <- data.frame(decade=seq(1790,2020,10),
    state=term_coef("state.r"),
    executive=term_coef("executive.r"),
    politics=term_coef("politics.r"),
    capability=term_coef("capability.r"),
    system=term_coef("system.r"),
    civic=term_coef("civic.r"),
    up=term_coef("up.r"),
    down=term_coef("down.r"),
    value=term_coef("value.r"),
    emo_pos=term_coef("emo_pos.r"),
    emo_neg=term_coef("emo_neg.r"),
    finance=term_coef("finance.r"),
    external=term_coef("external.r"),
    economy=term_coef("economy.r"),
    business=term_coef("business.r"),
    judgment=term_coef("judgment.r"),
    negotiation=term_coef("negotiation.r"),
    information=term_coef("information.r"),
    action=term_coef("action.r"),
    development=term_coef("development.r"),
    temporal=term_coef("temporal.r"),
    threat=term_coef("threat.r"),
    people=term_coef("people.r"),
    weness=term_coef("weness.r")
    )
    
  # Long format: one row per decade x category coefficient; column names are
  # the rate terms with the ".r" suffix stripped.
  tcoef2 <- pivot_longer(tcoef,cols=str_remove(rate_terms,"\\.r$"))

  # ggplot(subset(tcoef2,!(decade==1790&name=="system")),aes(x=decade,y=value))+geom_col(color="#20a387")+geom_point(color="white",size=1.5)+geom_hline(yintercept=0,color="red",linetype="dashed")+facet_wrap(.~name,scales="free_y")+theme_soft()+scale_color_viridis(option="viridis",begin=.05,end=.95)

  # Smoothed coefficient trajectories per category (rebuilds the earlier
  # crisis_vocab object from the model-coefficient data).  The incomplete
  # 2020 decade and the 1790 "system" cell are dropped.
  crisis_vocab <- ggplot(subset(tcoef2,(!decade==2020)&(!(decade==1790&name=="system"))),aes(x=decade,y=value))+geom_smooth(color="#20a387",span=1)+geom_point(color="#20a387",size=3.5)+geom_point(color="white",size=1.5)+geom_hline(yintercept=0,color="red",linetype="dashed")+facet_wrap(.~name,scales="free_y")+theme_soft()+scale_color_viridis(option="viridis",begin=.05,end=.95)+xlab("Decade")+ylab("Coefficient \n (coefficient > 0: presence of word increases the chance of article being part of the crisis corpus) \n (coefficient < 0: presence of word increases the chance of article being part of the routine corpus)")

  # ggsave(crisis_vocab,file="crisis_vocab.svg",device="svg",unit="cm",height=16,width=20,dpi=1200,scale=1.75)

  # ggsave(crisis_vocab,file="crisis_vocab.png",device="png",unit="cm",height=16,width=20,dpi=1200,scale=1.75)

  # ggsave(crisis_vocab,file="crisis_vocab.jpg",device="jpg",unit="cm",height=12,width=24,dpi=300,scale=1.75)


  # Era-specific effect frames from the joint era model, one df.<category>
  # data frame per rate term.  Equivalent to writing the 24 Effect() calls
  # out by hand; rate_terms (defined above) carries the same term order.
  for (rt in rate_terms) {
    assign(paste0("df.", str_remove(rt, "\\.r$")),
           data.frame(Effect(glm.crisis.era,
                             focal.predictors = c("era", rt))))
  }

  # Stack the 24 era effect frames row-wise (same column-wise concatenation
  # caveat as above: factors are coerced, so `era` is restored afterwards).
  mat.glm <- mapply(c,df.state,df.executive,df.politics,df.capability,df.system,df.civic,df.up,df.down,df.value,df.emo_pos,df.emo_neg,df.judgment,df.negotiation,df.information,df.action, df.development,df.temporal,df.finance,df.economy,df.business,df.external,df.threat,df.weness,df.people)


  df.glm <- data.frame(mat.glm)
  names(df.glm) <- c("era", "rfreq", "fit", "se", "lower", "upper")
  df.glm$era <- rep(df.state$era,times=24)

  # 20 rows per category: presumably 4 eras x 5 default predictor levels per
  # Effect() frame -- TODO confirm against nrow(df.state).  Label order must
  # match the frame order passed to mapply() above.
  df.glm$category <- rep(c( "state", "executive", "politics", "capability", "system", "civic", "up", "down", "value", "emo_pos",
              "emo_neg", "judgment", "negotiation", "information", "action", "development", "temporal", "finance", "economy", "business",
              "external", "threat", "weness", "people"),each=20)

  # Rate-column names for the 24 categories.
  # NOTE(review): `categories` is only (re)defined further below in this
  # section -- it must already exist from earlier in the script for this to
  # run top-to-bottom; confirm.
  categories.r <- paste0(categories,".r")

  # Containers for the per-category models and effect frames built in the
  # loops below.
  glm_ckw <- list()
  glm_ckw2 <- list()
  df_ckw_eff <- list()
  df_ckw_eff2 <- list()
  glm_ckw_era <- list()
  df_ckw_eff_era <- list()

  # Per-year share taken from the first row of the crisis x year proportion
  # table -- presumably the crisis share; confirm the level order of
  # `crisis`.  Used as a per-document base-rate control (`prior`).
  year_a_priori <- prop.table(with(df.cnc.terms,table(crisis,year)),2)[1,]

  df.cnc.terms$prior <- year_a_priori[match(df.cnc.terms$year,names(year_a_priori))]

  # 21 quantiles (0%..100% in 5% steps) of each category's rate, used later
  # to pick representative predictor levels for the effect plots.
  # (The original recomputed quantile() a second time on every iteration just
  # to reassign the same column names; the names are now set once.)
  probs <- seq(0, 1, .05)
  quantiles.r <- matrix(NA, nrow = length(categories.r), ncol = length(probs))
  colnames(quantiles.r) <- names(quantile(0, probs = probs))

  for (i in seq_along(categories.r))
    {
      quantiles.r[i,] <- quantile(df.cnc.terms[[categories.r[[i]]]],probs=probs,na.rm=TRUE)
    }


  # Per-category era-interaction models with the yearly prior as control;
  # the effect grid evaluates each rate at 0,5,...,100 within each era.
  for (i in seq_along(categories.r))
    {
      print(i)          # progress indicator (these fits are slow)
      flush.console()
      x <- (categories.r[i])
      df.cnc.terms$x <- df.cnc.terms[,x]
      glm_ckw_era[[categories[i]]] <- glm(crisis_d~prior+era*x,data=df.cnc.terms,family=binomial(link="logit"))
      # Index the stored model by name rather than position, so a prefilled
      # or reordered list cannot mismatch model and category label.
      df_ckw_eff_era[[categories[i]]] <- data.frame(Effect(mod=glm_ckw_era[[categories[i]]],focal.predictors=c("era", "x"),xlevels=list(x=seq(0,100,5))))
    }


  # Per-category decade-interaction models (prior-controlled); effect grid
  # at rate levels 0,10,...,100 within each decade.
  for (i in seq_along(categories))
    {
      x <- (categories.r[i])
      df.cnc.terms$x <- df.cnc.terms[,x]
      glm_ckw[[categories[i]]] <- glm(crisis_d~prior+fdecade*x,data=df.cnc.terms,family=binomial(link="logit"))
      # Name-based indexing (not [[i]]) keeps model and label aligned even if
      # the list was not empty when the loop started.
      df_ckw_eff[[categories[i]]] <- data.frame(Effect(mod=glm_ckw[[categories[i]]],focal.predictors=c("fdecade", "x"),xlevels=list(x=seq(0,100,10))))
    }


  # Same decade-interaction models, but the effect grid is evaluated at 0 and
  # the category-specific 50th/75th/95th percentiles from quantiles.r.
  for (i in seq_along(categories))
    {
      print(i)          # progress indicator (these fits are slow)
      flush.console()
      x <- (categories.r[i])
      df.cnc.terms$x <- df.cnc.terms[,x]
      glm_ckw2[[categories[i]]] <- glm(crisis_d~prior+fdecade*x,data=df.cnc.terms,family=binomial(link="logit"))
      # Name-based indexing (not [[i]]) keeps model and label aligned even if
      # the list was not empty when the loop started.
      df_ckw_eff2[[categories[i]]] <- data.frame(Effect(mod=glm_ckw2[[categories[i]]],focal.predictors=c("fdecade", "x"),xlevels=list(x=c(0,quantiles.r[i,"50%"],quantiles.r[i,"75%"],quantiles.r[i,"95%"]))))
    }


  # Quick per-category effect plots, indexed by position in `categories`
  # (1=state, 2=executive, 3=emo_pos, 4=emo_neg, 5=finance, 6=external,
  # 7=economy, 8=business).
  # NOTE(review): the df_ckw_eff frames come from the fdecade models and
  # contain no `era` column, so these plots will error when printed -- they
  # appear to predate the switch from era to fdecade; confirm.
  gg_state <- ggplot(df_ckw_eff[[1]],aes(y=fit,ymin=lower,ymax=upper,x=x,fill=era,color=era,group=era))+geom_ribbon(alpha=.3)+geom_line()+scale_color_viridis_d(begin=0,end=.85)+scale_fill_viridis_d(begin=0,end=.85)

  gg_executive <- ggplot(df_ckw_eff[[2]],aes(y=fit,ymin=lower,ymax=upper,x=x,fill=era,color=era,group=era))+geom_ribbon(alpha=.3)+geom_line()+scale_color_viridis_d(begin=0,end=.85)+scale_fill_viridis_d(begin=0,end=.85)

  gg_emo_pos <- ggplot(df_ckw_eff[[3]],aes(y=fit,ymin=lower,ymax=upper,x=x,fill=era,color=era,group=era))+geom_ribbon(alpha=.3)+geom_line()+scale_color_viridis_d(begin=0,end=.85)+scale_fill_viridis_d(begin=0,end=.85)

  gg_emo_neg <- ggplot(df_ckw_eff[[4]],aes(y=fit,ymin=lower,ymax=upper,x=x,fill=era,color=era,group=era))+geom_ribbon(alpha=.3)+geom_line()+scale_color_viridis_d(begin=0,end=.85)+scale_fill_viridis_d(begin=0,end=.85)

  gg_finance <- ggplot(df_ckw_eff[[5]],aes(y=fit,ymin=lower,ymax=upper,x=x,fill=era,color=era,group=era))+geom_ribbon(alpha=.3)+geom_line()+scale_color_viridis_d(begin=0,end=.85)+scale_fill_viridis_d(begin=0,end=.85)

  gg_external <- ggplot(df_ckw_eff[[6]],aes(y=fit,ymin=lower,ymax=upper,x=x,fill=era,color=era,group=era))+geom_ribbon(alpha=.3)+geom_line()+scale_color_viridis_d(begin=0,end=.85)+scale_fill_viridis_d(begin=0,end=.85)

  gg_economy <- ggplot(df_ckw_eff[[7]],aes(y=fit,ymin=lower,ymax=upper,x=x,fill=era,color=era,group=era))+geom_ribbon(alpha=.3)+geom_line()+scale_color_viridis_d(begin=0,end=.85)+scale_fill_viridis_d(begin=0,end=.85)

  gg_business <- ggplot(df_ckw_eff[[8]],aes(y=fit,ymin=lower,ymax=upper,x=x,fill=era,color=era,group=era))+geom_ribbon(alpha=.3)+geom_line()+scale_color_viridis_d(begin=0,end=.85)+scale_fill_viridis_d(begin=0,end=.85)



  # Rebuild the per-category effect frames from the stored decade models.
  # The original indexed glm_ckw with the literal string "get(categories[i])"
  # (always NULL -- the quotes made it a list key, not an expression) and
  # requested an "era" predictor that these fdecade-based models do not
  # contain; both are fixed here.
  for (i in seq_along(categories))
    {
      df_ckw_eff[[categories[i]]] <- data.frame(Effect(glm_ckw[[categories[i]]],focal.predictors=c("fdecade", "x")))
    }

  # Combine the era effect frames; recover the category from the rownames
  # that do.call(rbind) created ("state.1", "state.2", ...), stripping digits
  # and punctuation.
  df.ckw.eff_era <- do.call("rbind",df_ckw_eff_era)
  df.ckw.eff_era$category <- str_remove_all(rownames(df.ckw.eff_era),pattern="[:punct:]|[:digit:]")
  rownames(quantiles.r) <- (categories)

  # Stripping punctuation also removed the underscores; restore the two
  # emotion labels.
  df.ckw.eff_era$category <- replace(df.ckw.eff_era$category,df.ckw.eff_era$category=="emopos", "emo_pos")
  df.ckw.eff_era$category <- replace(df.ckw.eff_era$category,df.ckw.eff_era$category=="emoneg", "emo_neg")

  # Quantile matrix as a data frame with Q0..Q100 columns, then long format
  # for per-category lookup in the merge loop below.
  df.quantiles <- data.frame(quantiles.r)
  names(df.quantiles) <- paste0("Q",seq(0,100,5))
  df.quantiles$category <- rownames(quantiles.r)

  quantiles_long <- pivot_longer(df.quantiles,cols=paste0("Q",seq(0,100,5)))

  # Attach each category's median and upper quantiles to its effect rows so
  # the plot can draw quantile reference lines.  (Deduplicates the five
  # near-identical statements of the original; the Q50..Q100 columns are
  # still created in the same order.)
  for (i in seq_along(categories))
    {
      rows <- df.ckw.eff_era$category==categories[[i]]
      for (q in c("Q50", "Q75", "Q90", "Q95", "Q100"))
        {
          df.ckw.eff_era[rows,q] <- subset(quantiles_long,name==q & category==categories[[i]])$value
        }
    }


  # ggplot(df.ckw.eff_era,aes(x=x,y=fit,ymin=lower,ymax=upper,color=era,fill=era,group=era,shape=era))+geom_point()+geom_line()+geom_ribbon(alpha=.2)+facet_wrap(.~category)+theme_soft()+scale_color_viridis_d(option="mako",end=.9,begin=.1,direction=-1)+scale_fill_viridis_d(option="mako",end=.9,begin=.1,direction=-1)+geom_vline(aes(xintercept=Q50),linetype="solid")+geom_vline(aes(xintercept=Q75),linetype="longdash")+geom_vline(aes(xintercept=Q90),linetype="dashed")+geom_vline(aes(xintercept=Q95),linetype="dotted")



  # Combine the decade effect frames, recover the category from the rbind
  # rownames, and collapse decades into four eras (car::Recode on the
  # fdecade labels).
  df.ckw.eff <- do.call("rbind",df_ckw_eff)
  df.ckw.eff$category <- str_remove_all(rownames(df.ckw.eff),pattern="[:punct:]|[:digit:]")
  df.ckw.eff$era <- factor(Recode(df.ckw.eff$fdecade,"'1780'='1785-1899';'1790'='1785-1899';'1800'='1785-1899';'1810'='1785-1899';'1820'='1785-1899';'1830'='1785-1899';'1840'='1785-1899';'1850'='1785-1899';'1860'='1785-1899';'1870'='1785-1899';'1880'='1785-1899';'1890'='1785-1899';'1900'='1900-1949';'1910'='1900-1949';'1920'='1900-1949';'1930'='1900-1949';'1940'='1900-1949';'1950'='1950-1989';'1960'='1950-1989';'1970'='1950-1989';'1980'='1950-1989';'1990'='1990-2019';'2000'='1990-2019';'2010'='1990-2019'"),ordered=TRUE)

  # ggplot(subset(df.ckw.eff,fdecade!="2020"),aes(x=x,y=fit,ymin=lower,ymax=upper,color=era,fill=era,group=fdecade,shape=era))+geom_point()+geom_line()+facet_wrap(.~category)+theme_soft()+scale_color_viridis_d(option="viridis",end=.9,begin=.1)

  # NOTE(review): glm.state is not defined in this section -- presumably an
  # earlier era-interaction model; confirm before re-running top-to-bottom.
  df.state <- data.frame(Effect(glm.state,focal.predictors=c("era", "state.r"),xlevels=list(state.r=seq(0,100,10))))

                        
  # ggplot(df.glm,aes(fill=category,y=fit,ymin=lower,ymax=upper,x=rfreq))+geom_ribbon(alpha=.3)+facet_grid(category~era,scales="free")

  # ggplot(subset(df.glm,category=="state"),aes(color=era,fill=era,y=fit,ymin=lower,ymax=upper,x=rfreq))+geom_ribbon(alpha=.3)+facet_grid(.~era)
  # ggplot(subset(df.glm,category=="politics"),aes(color=era,fill=era,y=fit,ymin=lower,ymax=upper,x=rfreq))+geom_ribbon(alpha=.3)+facet_grid(.~era)
  # ggplot(subset(df.glm,category=="executive"),aes(color=era,fill=era,y=fit,ymin=lower,ymax=upper,x=rfreq))+geom_ribbon(alpha=.3)+facet_grid(.~era)


  # Canonical category order.  NOTE(review): this vector is referenced by the
  # loops ABOVE, so it must already be defined earlier in the script; this
  # line (re)states the ordering that positional indexing such as
  # df_ckw_eff[[1]]..[[8]] relies on -- confirm both definitions agree.
  categories <- c("state", "executive", "emo_pos", "emo_neg", "finance", "external", "economy", "business", "civic", "weness", "threat", "temporal", "people", "information", "action", "negotiation", "judgment", "politics", "up", "down", "development", "capability", "system", "value")


  # ggplot(cnc_top,aes(y=increase,x=era,fontface=fontface,color=state))+geom_point()



  # Hand-coded classification of the top crisis-vs-reference features into
  # broad classes; any feature not listed becomes NA.
  # NOTE(review): class meanings inferred from the mapping itself (KEY =
  # crisis keywords, GOV = government, VIO = violence/military, EPI =
  # epidemics, ECO = economy/finance, GEO = places, TIME = temporal, CAP =
  # capability, REA = reasoning/appraisal, TALK = negotiation, NEWS/REP =
  # reporting, QUAL = qualifiers, AMB = ambiguity, '???' = unclassified) --
  # confirm against the codebook.
  cnc_top$ft.class <- Recode(cnc_top$feature,"'government'='GOV';'much'='QUAL';'crisis'='KEY';'today'='TIME';'disaster'='KEY';'man'='GEN';'minister'='GOV';'state'='GOV';'people'='GEN';'can'='CAP';'cholera'='EPI';'epidemic'='EPI';'present'='TIME';'make'='CAP';'question'='AMB';'yesterday'='TIME';'power'='CAP';'seem'='AMB';'war'='VIO';'policy'='GOV';'however'='REA';'one'='GEN';'force'='CAP';'may'='AMB';'time'='TIME';'ministry'='GOV';'party'='POL';'even'='QUAL';'cause'='REA';'country'='GEO';'must'='URG';'cabinet'='GOV';'come'='???';'position'='CAP';'political'='POL';'russia'='GEO';'take'='???';'russian'='GEO';'occur'='REA';'little'='QUAL';'result'='CAP';'last'='TIME';'upon'='???';'great'='QUAL';'president'='GOV';'bulgaria'='GEO';'moment'='TIME';'course'='DEV';'believe'='REA';'report'='REP';'still'='DEV';'body'='VIO';'cheer'='EVAL';'affair'='EVAL';'vessel'='OBJ';'boat'='OBJ';'catastrophe'='KEY';'danger'='REA';'point'='???';'many'='QUAL';'telegram'='NEWS';'fact'='REP';'regard'='REA';'now'='TIME';'remain'='DEV';'possible'='CAP';'meet'='TALK';'explosion'='VIO';'difficulty'='CAP';'almost'='QUAL';'authority'='GOV';'majority'='POL';'without'='???';'feel'='REA';'recognise'='REA';'nothing'='???';'opinion'='REA';'become'='DEV';'correspondent'='NEWS';'conference'='TALK';'reach'='QUAL';'official'='GOV';'measure'='CAP';'interest'='POL';'accident'='VIO';'train'='OBJ';'army'='VIO';'mess'='QUAL';'troop'='VIO';'among'='???';'yet'='DEV';'though'='REA';'disease'='EPI';'france'='GEO';'attack'='VIO';'find'='REA';'turkish'='GEO';'serious'='QUAL';'nation'='POL';'see'='REA';
  'british'='GEO';'european'='GEO';'strike'='ACT';'mean'='REA';'fall'='QUAL';'national'='GEO';'go'='ACT';'already'='DEV';'far'='???';'military'='VIO';'tomorrow'='TIME';
  'general'='GEN';'europe'='GEO';'effort'='CAP';'hope'='EMO';'case'='REP';'work'='CAP';'bring'='CAP';'loss'='QUAL';'peace'='VIO';'expect'='REA';'public'='GEN';
  'organisation'='ORG';'us'='GEN';'leader'='GOV';'support'='CAP';'since'='TIME';'increase'='DEV';'message'='NEWS';'fear'='EMO';'continue'='DEV';'realise'='REA';'unite'='CAP';'financial'='ECO';'miner'='VIC';'foreign'='GEO';'action'='CAP';'collapse'='KEY';'million'='ECO';'germany'='GEO';'effect'='CAP';'likely'='AMB';'tragedy'='KEY';'accord'='TALK';'demand'='TALK';'union'='GEO';'german'='GEO';'chief'='GOV';'finance'='ECO';'committee'='REP';'election'='GOV';'bank'='ECO';'executive'='GOV';'deal'='TALK';'world'='GEO';'system'='CAP';'warn'='REA';'need'='QUAL';'bad'='QUAL';'emergency'='KEY';'labour'='POL';'face'='REA';'fail'='REA';'industry'='ECO';'risk'='REA';'community'='GEN';'currency'='ECO';'week'='TIME';'also'='REA';'international'='GEO';'month'='TIME';'problem'='QUAL';'britain'='GEO';'health'='VALUE';else=NA")


  # ggplot(subset(cnc_top,!is.na(ft.class)),aes(y=increase,x=era,group=ft.class,color=ft.class))+geom_point()+geom_smooth(se=FALSE)+theme(legend.position="none")+facet_wrap(.~ft.class)

  # ggplot(subset(cnc_top,!is.na(ft.class)),aes(y=ratio,x=era,group=ft.class,color=ft.class))+geom_point()+geom_smooth(se=FALSE)+theme(legend.position="none")+facet_wrap(.~ft.class)

  # ggplot(cnc_top,aes(label=feature,y=increase,x=era,group=feature,color=feature))+geom_point()+geom_line()+theme(legend.position="none")+geom_text()

  # ggplot(subset(cnc_top,ft.class=="VIO"),aes(label=feature,y=increase,x=era,group=feature,color=feature))+geom_point(aes(size=reference1000))+geom_line()+theme(legend.position="none")+geom_text(nudge_y=0.025)+geom_text(aes(label=round(reference1000,2)),nudge_y=0.025,nudge_x=0.2,color="black")+geom_text(aes(label=round(target1000,2)),nudge_y=0.025,nudge_x=0.4,color="red")+theme_bluewhite()

  # ggplot(subset(cnc_top,ft.class=="GOV"),aes(label=feature,y=increase,x=era,group=feature,color=feature))+geom_point()+geom_line()+theme(legend.position="none")+geom_text(nudge_y=0.025)+geom_text(aes(label=round(reference1000,2)),nudge_y=0.025,nudge_x=0.2,color="black")+geom_text(aes(label=round(target1000,2)),nudge_y=0.025,nudge_x=0.4,color="red")+theme_bluewhite()

  # ggplot(subset(cnc_top,ft.class=="REA"),aes(label=feature,y=increase,x=era,group=feature,color=feature))+geom_point()+geom_line()+theme(legend.position="none")+geom_text(nudge_y=0.025)+geom_text(aes(label=round(reference1000,2)),nudge_y=0.025,nudge_x=0.2,color="black")+geom_text(aes(label=round(target1000,2)),nudge_y=0.025,nudge_x=0.4,color="red")+theme_bluewhite()

  # ggplot(subset(cnc_top,ft.class=="CAP"),aes(label=feature,y=increase,x=era,group=feature,color=feature))+geom_point()+geom_line()+theme(legend.position="none")+geom_text(nudge_y=0.025)+geom_text(aes(label=round(reference1000,2)),nudge_y=0.025,nudge_x=0.2,color="black")+geom_text(aes(label=round(target1000,2)),nudge_y=0.025,nudge_x=0.4,color="red")+theme_bluewhite()

  # ggplot(subset(cnc_top,ft.class=="TALK"),aes(label=feature,y=increase,x=era,group=feature,color=feature))+geom_point()+geom_line()+theme(legend.position="none")+geom_text(nudge_y=0.025)+geom_text(aes(label=round(reference1000,2)),nudge_y=0.025,nudge_x=0.2,color="black")+geom_text(aes(label=round(target1000,2)),nudge_y=0.025,nudge_x=0.4,color="red")+theme_bluewhite()+ylim(0,0.75)


  # ggplot(subset(cnc_top,ft.class=="EVAL"),aes(label=feature,y=increase,x=era,group=feature,color=feature))+geom_point()+geom_line()+theme(legend.position="none")+geom_text()

  # cnc_clean2_2000
  # ggplot(cnc_clean2_2000[1:50,],aes(y=target1000,x=rank,fill=ratio))+geom_col()+coord_flip()+geom_col(aes(y=reference1000),fill="grey",alpha=.5)+geom_text(aes(label=feature),hjust=0,vjust=0.25)
  # ggplot(cnc_clean2_1950[1:50,],aes(y=target1000,x=rank,fill=ratio))+geom_col()+coord_flip()+geom_col(aes(y=reference1000),fill="grey",alpha=.5)+geom_text(aes(label=feature),hjust=0,vjust=0.25)

  # Keyness comparison plots (quanteda.textplots::textplot_keyness): the top
  # 40 features distinguishing the target corpus from the reference corpus,
  # overall and for each era-specific keyness object.
  # NOTE(review): the subset condition `fontface=="bold"` presumably filters
  # on a variable attached where cnc_clean2 is built — confirm upstream.
  textplot_keyness(subset(cnc_clean2,fontface=="bold"),n=40)+theme_bluewhite() 

  # Era-specific keyness plots (1800s, 1900-, 1950-, 2000- sub-corpora).
  textplot_keyness(cnc_clean2_1800,n=40)+theme_bluewhite() 
  textplot_keyness(cnc_clean2_1900,n=40)+theme_bluewhite() 
  textplot_keyness(cnc_clean2_1950,n=40)+theme_bluewhite() 
  textplot_keyness(cnc_clean2_2000,n=40)+theme_bluewhite() 

  # Re-apply the feature -> semantic-class mapping (same mapping as above,
  # reformatted across lines). FIX(review): the original ended with three
  # junk terms `''='';''='';''=''` and a trailing semicolon, and omitted the
  # `else=NA` clause — so unmatched features kept their own names instead of
  # becoming NA, silently breaking the `!is.na(ft.class)` filters used in the
  # plots. Restored `else=NA` for consistency with the first Recode call.
  cnc_top$ft.class <- Recode(cnc_top$feature,"'government'='GOV';'much'='QUAL';'crisis'='KEY';'today'='TIME';'disaster'='KEY';'man'='GEN';'minister'='GOV';'state'='GOV';
  'people'='GEN';'can'='CAP';'cholera'='EPI';'epidemic'='EPI';'present'='TIME';'make'='CAP';'question'='AMB';'yesterday'='TIME';'power'='CAP';'seem'='AMB';'war'='VIO';
  'policy'='GOV';'however'='REA';'one'='GEN';'force'='CAP';'may'='AMB';'time'='TIME';'ministry'='GOV';'party'='POL';'even'='QUAL';'cause'='REA';'country'='GEO';'must'='URG';
  'cabinet'='GOV';'come'='???';'position'='CAP';'political'='POL';'russia'='GEO';'take'='???';'russian'='GEO';'occur'='REA';'little'='QUAL';'result'='CAP';'last'='TIME';
  'upon'='???';'great'='QUAL';'president'='GOV';'bulgaria'='GEO';'moment'='TIME';'course'='DEV';'believe'='REA';'report'='REP';'still'='DEV';'body'='VIO';'cheer'='EVAL';
  'affair'='EVAL';'vessel'='OBJ';'boat'='OBJ';'catastrophe'='KEY';'danger'='REA';'point'='???';'many'='QUAL';'telegram'='NEWS';'fact'='REP';'regard'='REA';'now'='TIME';
  'remain'='DEV';'possible'='CAP';'meet'='TALK';'explosion'='VIO';'difficulty'='CAP';'almost'='QUAL';'authority'='GOV';'majority'='POL';'without'='???';'feel'='REA';
  'recognise'='REA';'nothing'='???';'opinion'='REA';'become'='DEV';'correspondent'='NEWS';'conference'='TALK';'reach'='QUAL';'official'='GOV';'measure'='CAP';
  'interest'='POL';'accident'='VIO';'train'='OBJ';'army'='VIO';'mess'='QUAL';'troop'='VIO';'among'='???';'yet'='DEV';'though'='REA';'disease'='EPI';'france'='GEO';
  'attack'='VIO';'find'='REA';'turkish'='GEO';'serious'='QUAL';'nation'='POL';'see'='REA';
  'british'='GEO';'european'='GEO';'strike'='ACT';'mean'='REA';'fall'='QUAL';'national'='GEO';'go'='ACT';'already'='DEV';'far'='???';'military'='VIO';'tomorrow'='TIME';
  'general'='GEN';'europe'='GEO';'effort'='CAP';'hope'='EMO';'case'='REP';'work'='CAP';'bring'='CAP';'loss'='QUAL';'peace'='VIO';'expect'='REA';'public'='GEN';
  'organisation'='ORG';'us'='GEN';'leader'='GOV';'support'='CAP';'since'='TIME';'increase'='DEV';'message'='NEWS';'fear'='EMO';'continue'='DEV';'realise'='REA';'unite'='CAP';
  'financial'='ECO';'miner'='VIC';'foreign'='GEO';'action'='CAP';'collapse'='KEY';'million'='ECO';'germany'='GEO';'effect'='CAP';'likely'='AMB';'tragedy'='KEY';'accord'='TALK';
  'demand'='TALK';'union'='GEO';'german'='GEO';'chief'='GOV';'finance'='ECO';'committee'='REP';'election'='GOV';'bank'='ECO';'executive'='GOV';'deal'='TALK';'world'='GEO';
  'system'='CAP';'warn'='REA';'need'='QUAL';'bad'='QUAL';'emergency'='KEY';'labour'='POL';'face'='REA';'fail'='REA';'industry'='ECO';'risk'='REA';'community'='GEN';
  'currency'='ECO';'week'='TIME';'also'='REA';'international'='GEO';'month'='TIME';'problem'='QUAL';'britain'='GEO';'health'='VALUE';else=NA")


  # Hand-built quanteda dictionary grouping top crisis-context features into
  # broad semantic categories.
  # FIX(review): the original dictionary(list( call was never closed, so the
  # following table() call was parsed as part of the list (syntax error);
  # it also contained empty-string entries ("") that can never match a token.
  # Both repaired; the word lists themselves are unchanged.
  crisiscomp <- dictionary(list(
    power       = c("government", "minister", "state", "national", "public", "leader", "power", "policy", "ministry", "party", "president", "cabinet", "position", "political"),
    quantity    = c("much", "far", "one", "great", "little"),
    people      = c("people", "man", "us", "miner"),
    alarm       = c("crisis", "disaster", "fall", "loss", "increase"),
    danger      = c("cholera", "military", "peace", "epidemic", "war"),
    time        = c("today", "already", "tomorrow", "since", "present", "yesterday", "time", "moment", "last", "still"),
    money       = c("financial", "bank"),
    negotiation = c("strike"),
    message     = c("news", "message", "report"),
    reasoning   = c("however", "question", "may", "even", "cause", "occur"),
    capability  = c("can", "go", "effort", "work", "bring", "support", "continue", "unite", "make", "force", "must", "come", "take", "result"),
    mindset     = c("hope", "expect", "fear", "realise", "seem", "believe")
  ))

  # Document counts by year and corpus.
  table(docvars(x.dfm)$year,docvars(x.dfm)$corp)

  # Per-document counts of the literal tokens "government" and "crisis"
  # via single-word quanteda dictionaries.
  gov <- dfm_lookup(x=t.dtm,dictionary=dictionary(list(government="government")))
  cri <- dfm_lookup(x=t.dtm,dictionary=dictionary(list(crisis="crisis")))

  # Modelling frame: the two token counts, the document year, and a 0/1
  # indicator for membership of the "Crisis" corpus.
  df.gov <- data.frame(cri=as.numeric(cri[,1]),gov=as.numeric(gov[,1]),year=docvars(t.dtm)$year,crisis=as.numeric(docvars(t.dtm)$Corpus=="Crisis"))
  # Yearly share of the first corpus column (year-level baseline), matched
  # back onto each document by year.
  baseline <- table(docvars(t.dtm)$year,docvars(t.dtm)$Corpus)[,1]/rowSums(table(docvars(t.dtm)$year,docvars(t.dtm)$Corpus))
  df.gov$baseline <- baseline[match(df.gov$year,names(baseline))]

  # Recode year so 1785 -> 1, and build 0/1 (plus ordered-factor) versions
  # of the two counts ("does the document mention the word at all?").
  df.gov$year_num <- as.numeric(df.gov$year)-1784
  df.gov$gov_bin <- as.numeric(df.gov$gov>0)
  df.gov$gov_bin_f <- factor(df.gov$gov_bin,levels=c(0,1),ordered=TRUE)
  df.gov$cri_bin <- as.numeric(df.gov$cri>0)
  df.gov$cri_bin_f <- factor(df.gov$cri_bin,levels=c(0,1),ordered=TRUE)

  # NOTE(review): lda() here presumably resolves to MASS::lda (the topic-model
  # package "lda" is also installed upstream) — confirm the intended function.
  lda.gov <- lda(crisis~gov_bin_f*year_num,data=df.gov)
  # Logistic model of crisis-corpus membership on the yearly baseline plus
  # mention-by-time interactions for "government" and "crisis".
  glm.gov <- glm(crisis~baseline+gov_bin_f*year_num+cri_bin_f*year_num,data=df.gov,family=binomial(link="logit"))

  # Marginal-effect frames (effects::Effect) for the two interactions.
  df.eff.gov <- data.frame(Effect(glm.gov,focal.predictors=c("year_num", "gov_bin_f")))
  df.eff.cri <- data.frame(Effect(glm.gov,focal.predictors=c("year_num", "cri_bin_f")))

  #  ggplot(df.eff.gov,aes(y=fit,ymin=lower,ymax=upper,x=year_num,color=gov_bin_f,fill=gov_bin_f,shape=gov_bin_f,group=gov_bin_f))+geom_point(size=5)+geom_ribbon(alpha=.1)+scale_fill_viridis_d(begin=0.1,end=0.9)+scale_color_viridis_d(begin=0.1,end=0.9)+ylim(0,1)
  #  ggplot(df.eff.cri,aes(y=fit,ymin=lower,ymax=upper,x=year_num,color=cri_bin_f,fill=cri_bin_f,shape=cri_bin_f,group=cri_bin_f))+geom_point(size=5)+geom_ribbon(alpha=.1)+scale_fill_viridis_d(begin=0.1,end=0.9)+scale_color_viridis_d(begin=0.1,end=0.9)+ylim(0,1)

  # Model frame plus residuals from the full model ($resid partial-matches
  # $residuals, i.e. the glm working residuals).
  df.gov2 <- glm.gov$model
  df.gov2$crisis.res.time <- glm.gov$resid

  # Reduced model: regress the residualised outcome on the gov interaction
  # alone. NOTE(review): named "glm.gov.red" but fitted with lm(), not glm().
  glm.gov.red <- lm(crisis.res.time~gov_bin_f*year_num,data=df.gov2)

  df.eff.gov2 <- data.frame(Effect(glm.gov.red,focal.predictors=c("year_num", "gov_bin_f")))

  # ggplot(df.eff.gov2,aes(y=fit,ymin=lower,ymax=upper,x=year_num,color=gov_bin_f,fill=gov_bin_f,shape=gov_bin_f,group=gov_bin_f))+geom_point(size=5)+geom_ribbon()

  # One multivariate logistic model per era: crisis-article indicator
  # (crisis_d) regressed on the 24 rate-scaled (.r) category scores.
  glm_ckw_era1 <- glm(crisis_d~ state.r+executive.r+emo_pos.r+emo_neg.r+finance.r+external.r+economy.r+business.r+civic.r+weness.r+threat.r+temporal.r+people.r+information.r+action.r+negotiation.r+judgment.r+politics.r+up.r+down.r+development.r+capability.r+system.r+value.r,data=subset(df.cnc.terms,era=="1785-1899"),family=binomial(link="logit"))
  glm_ckw_era2 <- glm(crisis_d~ state.r+executive.r+emo_pos.r+emo_neg.r+finance.r+external.r+economy.r+business.r+civic.r+weness.r+threat.r+temporal.r+people.r+information.r+action.r+negotiation.r+judgment.r+politics.r+up.r+down.r+development.r+capability.r+system.r+value.r,data=subset(df.cnc.terms,era=="1900-1949"),family=binomial(link="logit"))
  glm_ckw_era3 <- glm(crisis_d~ state.r+executive.r+emo_pos.r+emo_neg.r+finance.r+external.r+economy.r+business.r+civic.r+weness.r+threat.r+temporal.r+people.r+information.r+action.r+negotiation.r+judgment.r+politics.r+up.r+down.r+development.r+capability.r+system.r+value.r,data=subset(df.cnc.terms,era=="1950-1999"),family=binomial(link="logit"))
  glm_ckw_era4 <- glm(crisis_d~ state.r+executive.r+emo_pos.r+emo_neg.r+finance.r+external.r+economy.r+business.r+civic.r+weness.r+threat.r+temporal.r+people.r+information.r+action.r+negotiation.r+judgment.r+politics.r+up.r+down.r+development.r+capability.r+system.r+value.r,data=subset(df.cnc.terms,era=="2000-2020"),family=binomial(link="logit"))

  # Profile-likelihood confidence intervals for each era model (confint.glm;
  # can be slow for 24 predictors).
  ci_era1 <- confint(glm_ckw_era1) 
  ci_era2 <- confint(glm_ckw_era2)
  ci_era3 <- confint(glm_ckw_era3)
  ci_era4 <- confint(glm_ckw_era4)

  # Tidy coefficient frames per era ($coef partial-matches $coefficients),
  # then stack into one long frame for plotting.
  eff_era1 <- data.frame(category=rownames(ci_era1),lo=ci_era1[,1],est=glm_ckw_era1$coef,hi=ci_era1[,2],era="1785-1899")
  eff_era2 <- data.frame(category=rownames(ci_era2),lo=ci_era2[,1],est=glm_ckw_era2$coef,hi=ci_era2[,2],era="1900-1949")
  eff_era3 <- data.frame(category=rownames(ci_era3),lo=ci_era3[,1],est=glm_ckw_era3$coef,hi=ci_era3[,2],era="1950-1999")
  eff_era4 <- data.frame(category=rownames(ci_era4),lo=ci_era4[,1],est=glm_ckw_era4$coef,hi=ci_era4[,2],era="2000-2020")

  eff_era <- rbind(eff_era1,eff_era2,eff_era3,eff_era4)

  # ggplot(eff_era[c(-1),],aes(y=est,ymin=lo,ymax=hi,x=era,color=category))+geom_errorbar()+facet_wrap(.~category,scales="free_y")+geom_hline(yintercept=0)+theme_bluewhite()+theme(legend.position="none")

  # Per-predictor effect profiles (x = 0, 5, ..., 50) from each era model.
  # FIX(review): the original loop (a) overwrote the eff_era1..4 coefficient
  # frames built above with Effect objects, (b) misspelled the argument as
  # `focal.predictor`, and (c) passed xlevels=list(x=...), which keys the
  # levels to a literal column "x" rather than the focal predictor.
  # Results now go into dedicated lists, keyed by predictor name.
  eff_cat_era1 <- list()
  eff_cat_era2 <- list()
  eff_cat_era3 <- list()
  eff_cat_era4 <- list()
  for (i in seq_along(categories.r))
    {
      x <- categories.r[i]
      # xlevels must be a named list keyed by the predictor itself.
      xlev <- setNames(list(seq(0, 50, 5)), x)
      eff_cat_era1[[x]] <- Effect(glm_ckw_era1, focal.predictors = x, xlevels = xlev)
      eff_cat_era2[[x]] <- Effect(glm_ckw_era2, focal.predictors = x, xlevels = xlev)
      eff_cat_era3[[x]] <- Effect(glm_ckw_era3, focal.predictors = x, xlevels = xlev)
      eff_cat_era4[[x]] <- Effect(glm_ckw_era4, focal.predictors = x, xlevels = xlev)
    }

  # One bivariate logistic model per category and era (crisis_d ~ category
  # rate), plus fitted-probability profiles. Models and effect frames are
  # keyed by category name.
  glm_ckw1 <- list()
  glm_ckw2 <- list()
  glm_ckw3 <- list()
  glm_ckw4 <- list()

  df_ckw_eff1 <- list()
  df_ckw_eff2 <- list()
  df_ckw_eff3 <- list()
  df_ckw_eff4 <- list()

  # The era subsets are loop-invariant; hoist them out of the loop (the
  # original recomputed all four subsets on every iteration).
  df1 <- subset(df.cnc.terms,era=="1785-1899")
  df2 <- subset(df.cnc.terms,era=="1900-1949")
  df3 <- subset(df.cnc.terms,era=="1950-1999")
  df4 <- subset(df.cnc.terms,era=="2000-2020")

  for (i in seq_along(categories))
    {
      # NOTE(review): the loop length comes from `categories` but the column
      # name from `categories.r` — confirm the two vectors are aligned.
      x <- categories.r[i]
      # Copy the focal category column into a fixed column name `x` so the
      # same formula (crisis_d ~ x) can be reused for every category.
      df1$x <- df1[[x]]
      df2$x <- df2[[x]]
      df3$x <- df3[[x]]
      df4$x <- df4[[x]]
      glm_ckw1[[categories[i]]] <- glm(crisis_d~x,data=df1,family=binomial(link="logit"))
      glm_ckw2[[categories[i]]] <- glm(crisis_d~x,data=df2,family=binomial(link="logit"))
      glm_ckw3[[categories[i]]] <- glm(crisis_d~x,data=df3,family=binomial(link="logit"))
      glm_ckw4[[categories[i]]] <- glm(crisis_d~x,data=df4,family=binomial(link="logit"))

      # Fitted probabilities over x = 0, 10, ..., 100 (11 levels; the label
      # bookkeeping below depends on this count).
      df_ckw_eff1[[categories[i]]] <- data.frame(Effect(mod=glm_ckw1[[i]],focal.predictors=c("x"),xlevels=list(x=seq(0,100,10))))
      df_ckw_eff2[[categories[i]]] <- data.frame(Effect(mod=glm_ckw2[[i]],focal.predictors=c("x"),xlevels=list(x=seq(0,100,10))))
      df_ckw_eff3[[categories[i]]] <- data.frame(Effect(mod=glm_ckw3[[i]],focal.predictors=c("x"),xlevels=list(x=seq(0,100,10))))
      df_ckw_eff4[[categories[i]]] <- data.frame(Effect(mod=glm_ckw4[[i]],focal.predictors=c("x"),xlevels=list(x=seq(0,100,10))))
    }
    
  # Stack the per-category / per-era effect frames into one long frame.
  # FIX(review): the original grew the frame with rbind() inside a loop over
  # a hard-coded 1:24, seeded with an NA row that was subset()ed away again;
  # build it in one pass instead, sized by the actual number of categories.
  df2_ckw_eff <- do.call(rbind, lapply(seq_along(df_ckw_eff1), function(i) {
    rbind(df_ckw_eff1[[i]], df_ckw_eff2[[i]], df_ckw_eff3[[i]], df_ckw_eff4[[i]])
  }))

  # Label rows: each effect frame contributes 11 x-levels (seq(0, 100, 10)),
  # stacked era1..era4 within each category (11 * 4 = 44 rows per category).
  df2_ckw_eff$era <- rep(rep(c("1785-1899", "1900-1949", "1950-1999", "2000-2020"),each=11),times=length(df_ckw_eff1))
  df2_ckw_eff$category <- rep(names(df_ckw_eff1),each=44)

  gg_probs_ckw <- ggplot(df2_ckw_eff,aes(y=fit,ymin=lower,ymax=upper,x=x,fill=era,color=era,group=era))+geom_ribbon(alpha=.3)+facet_wrap(.~category)+theme_bluewhite()+scale_color_viridis_d(begin=0,end=.85)+scale_fill_viridis_d(begin=0,end=.85)

  # Slope estimates with profile CIs for each bivariate model, four rows
  # (one per era) per category, written into fixed row positions.
  df.eff <- data.frame(category=NA,era=NA,lo=NA,est=NA,hi=NA)

  # NOTE(review): the loop starts at 7, and row indices ((i-1)*4)+1 start at
  # 25 — rows 2..24 of df.eff are never filled and remain NA (they show up
  # as empty facets in the plot). Confirm whether skipping the first six
  # categories is intentional.
  for (i in 7:length(categories))
    {
      df.eff[((i-1)*4)+1,"category"] <- categories[i]
      df.eff[((i-1)*4)+2,"category"] <- categories[i]
      df.eff[((i-1)*4)+3,"category"] <- categories[i]
      df.eff[((i-1)*4)+4,"category"] <- categories[i]
      df.eff[((i-1)*4)+1,"era"] <- "1785-1899"
      df.eff[((i-1)*4)+2,"era"] <- "1900-1949"
      df.eff[((i-1)*4)+3,"era"] <- "1950-1999"
      df.eff[((i-1)*4)+4,"era"] <- "2000-2020"
      # Profile-likelihood CIs for each era model of this category; row 2 of
      # the CI matrix / coefficient 2 is the slope on the category rate.
      ci1 <- confint(glm_ckw1[[i]])
      ci2 <- confint(glm_ckw2[[i]])
      ci3 <- confint(glm_ckw3[[i]])
      ci4 <- confint(glm_ckw4[[i]])
      est1 <- (glm_ckw1[[i]])$coef[2]
      est2 <- (glm_ckw2[[i]])$coef[2]
      est3 <- (glm_ckw3[[i]])$coef[2]
      est4 <- (glm_ckw4[[i]])$coef[2]
      df.eff[((i-1)*4)+1,c("lo", "est", "hi")] <- c(ci1[2,1],est1,ci1[2,2])
      df.eff[((i-1)*4)+2,c("lo", "est", "hi")] <- c(ci2[2,1],est2,ci2[2,2])
      df.eff[((i-1)*4)+3,c("lo", "est", "hi")] <- c(ci3[2,1],est3,ci3[2,2])
      df.eff[((i-1)*4)+4,c("lo", "est", "hi")] <- c(ci4[2,1],est4,ci4[2,2])
    }

  # ggplot(df.eff,aes(y=est,ymin=lo,ymax=hi,group=category,x=era,color=category,fill=category))+geom_point()+geom_ribbon(alpha=0.25)+geom_line()+geom_hline(yintercept=0,color="red")+facet_wrap(category~.)+theme_bluewhite()

  # Faceted slope-by-era plot (free y-scales per category).
  gg_eff_ckw <- ggplot(df.eff,aes(y=est,ymin=lo,ymax=hi,group=category,x=era,color=category,fill=category))+geom_point()+geom_ribbon(alpha=0.25)+geom_line()+geom_hline(yintercept=0,color="red")+facet_wrap(category~.,scales="free_y")+theme_bluewhite()+theme(legend.position="none")

  # ggsave(total_change_by_era,file="gg_total_change_by_era.svg",unit="cm",width=16,height=16,dpi=1200,scale=1.25)

  # ggsave(shares_change_by_era,file="gg_shares_change_by_era.svg",unit="cm",width=16,height=16,dpi=1200,scale=1.25)

  # ggsave(gg_probs_ckw,file="gg_probs_ckw.svg",unit="cm",width=16,height=16,dpi=1200,scale=1.25)

  # ggsave(gg_eff_ckw,file="gg_eff_ckw.svg",unit="cm",width=16,height=16,dpi=1200,scale=1.25)

  # ggplot(df.eff,aes(y=est,ymin=lo,ymax=hi,group=category,x=era,color=category,fill=category))+geom_point()+geom_ribbon(alpha=0.25)+geom_line()+geom_hline(yintercept=0,color="red")+theme_bluewhite()

  # ggplot(eff_era[c(-1),],aes(y=est,ymin=lo,ymax=hi,x=era,color=category))+geom_errorbar()+facet_wrap(.~category,scale="free_y")+geom_hline(yintercept=0,color="red")+theme_bluewhite()+theme(legend.position="none")

6.3 Gender differences

7 Appendices

7.1 Crisis Inflation Appendices

Code
#####
#####
##### Crisis inflation appendices.

#### Load packages

library(stringr)
library(erer)
library(ggplot2)
library(car)
library(tidyverse)
library(viridis)
library(ppcor)
library(stargazer)
library(DescTools)
library(car)


#### Define user-defined functions

# theme_bluewhite
# theme_bluewhite: light-blue ggplot2 theme layered over theme_bw().
#
# Args:
#   base_size:   base font size, forwarded to theme_bw().
#   base_family: base font family, forwarded to theme_bw().
#
# FIX(review): the original ignored both of its arguments, and used the
# deprecated `size` argument of element_line() (renamed `linewidth` in
# ggplot2 3.4; this file already uses `linewidth` elsewhere, so the newer
# API is in use). If "Open Sans" is not installed, ggplot2 falls back with
# a warning — confirm the font is available where this runs.
theme_bluewhite <- function(base_size = 11, base_family = "Open Sans") {
  theme_bw(base_size = base_size, base_family = base_family) %+replace%
    theme(
      panel.grid.major = element_line(color = "lightskyblue2", linetype = "solid", linewidth = .5),
      panel.grid.minor = element_line(color = "lightskyblue3", linetype = "dotted", linewidth = .33),
      panel.background = element_rect(fill = "aliceblue"),
      panel.border = element_rect(color = "lightskyblue3", fill = NA),
      plot.background = element_rect(fill = "lightskyblue1"),
      axis.line = element_line(color = "lightskyblue3"),
      axis.ticks = element_line(color = "lightskyblue3"),
      axis.text = element_text(color = "black"),
      axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
      axis.title = element_text(face = "bold"),
      strip.background = element_rect(color = "gray50", fill = "steelblue"),
      strip.text = element_text(color = "white", face = "bold"),
      legend.background = element_rect(fill = alpha("lightskyblue1", .5), colour = "lightskyblue3")
    )
}

# theme_soft
# theme_soft: warm beige/white ggplot2 theme layered over theme_bw().
#
# Args:
#   base_size:   base font size, forwarded to theme_bw().
#   base_family: base font family, forwarded to theme_bw().
#
# FIX(review): the original ignored both of its arguments, and used the
# deprecated `size` argument of element_line()/element_rect() (renamed
# `linewidth` in ggplot2 3.4, which this file already uses elsewhere).
theme_soft <- function(base_size = 11, base_family = "Open Sans") {
  theme_bw(base_size = base_size, base_family = base_family) %+replace%
    theme(
      panel.grid.major = element_line(color = "#ffffff", linetype = "solid", linewidth = 0.5),
      panel.grid.minor = element_line(color = "#ffffff", linetype = "dotted", linewidth = .33),
      panel.background = element_rect(fill = "#f6e0b5"),
      panel.border = element_rect(color = "#ffffff", fill = NA, linewidth = 1.0),
      plot.background = element_rect(fill = "#fff4e6"),
      axis.line = element_line(color = "#ffffff", linewidth = 1.0),
      axis.ticks = element_line(color = "#ffffff"),
      axis.text = element_text(color = "#000000"),
      axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
      axis.title = element_text(face = "bold"),
      strip.background = element_rect(color = "#967259", fill = "#be9b7b"),
      strip.text = element_text(color = "white", face = "bold"),
      legend.background = element_rect(fill = alpha("#f6e0b5", .5), colour = "#937342")
    )
}

# NOTE(review): hard-coded, user-specific Windows path — this breaks
# portability; consider a project-relative path or the here package.
setwd("a://onedrive - ntnu//2-data//crisis-collab")

# Pre-computed analysis objects (models M*, `thetimes`, `spend`, `vdem*`,
# df_* frames and gg_* plots used below are loaded from these images).
load("a://onedrive - ntnu//2-data//crisis-collab//media_indicators.RData")
load("a://onedrive - ntnu//2-data//crisis-collab//crisis_rate_analysis.RData")
load("a://onedrive - ntnu//2-data//crisis-collab//df_bes_year.RData")
load("a://onedrive - ntnu//2-data//crisis-collab//sk10t.RData")
load("a://onedrive - ntnu//2-data//crisis-collab//e+times+wstar+nzz.RData")

### Appendix A: Correlation matrix

# Variables correlated in Appendix A; all must be columns of `thetimes`.
corvars <- c("year", "spending_GDP", "gini_rev_i", "Penetration", "media_autonomy_i", "ccORG1000", "CL_pcent", "CNW_pcent", "cnw")

cor(thetimes[, corvars]) # Zero-order correlations
pcor(thetimes[, corvars]) # partial correlations (ppcor::pcor)

### Appendix B: Models

# Text-format regression tables for the pre-fitted model series M1-M3.
# Table B1
stargazer(M1.2, M1.3, M1.4, M1.5, M1.6, M1.7, M1.1, type = "text", intercept.bottom = FALSE)
# Table B2
stargazer(M2.2, M2.3, M2.4, M2.5, M2.6, M2.7, M2.1, type = "text", intercept.bottom = FALSE)
# Table B3
stargazer(M3.2, M3.3, M3.4, M3.5, M3.6, M3.7, M3.1, type = "text", intercept.bottom = FALSE)

# Linear trend of crisis-organisation mentions per 1000 on `dyear`,
# appended to the pre-fitted M4 series.
M4.4 <- lm(ccORG1000 ~ 1 + dyear, data = thetimes)
# Table B4
stargazer(M4.0, M4.1, M4.2, M4.3, M4.4, type = "text", intercept.bottom = FALSE)

### Appendix C: Comparing the strength of effects on CL salience vs CNW salience

### Appendix D: External data sources: Media Penetration

# No data to be included

### Appendix E: Potential drivers of crisis coverage

# Clean the raw UK government spending table. WARNING(review): everything
# below addresses `spend` by column POSITION (e.g. spend[, 84:101]); these
# magic indices depend on the exact layout of the loaded file — verify
# against the source spreadsheet before changing anything.
names(spend)[1] <- c("Year")
spend$Pop <- as.numeric(spend$Pop)
# Nominal GDP arrives with thousands separators; strip "." and "," first.
spend$nGDP <- as.numeric(gsub("\\.|\\,", "", spend$Nominal.GDP))
# NOTE(review): rows 75:230 are rescaled by 1/100 — presumably a unit change
# in the source series; confirm the row range against the data.
spend$nGDP[75:230] <- spend$nGDP[75:230] / 100
spend$rGDP <- as.numeric(gsub("\\,", "", spend$Real.GDP))
spend$Spending <- as.numeric(spend$Spending)
# Alternative total: sum over the 18 detailed spending columns.
spend$SUMME <- rowSums(spend[, 84:101])
spend$Spending2 <- as.numeric(spend$SUMME)

# Aggregate the detailed columns into 18 named spending categories
# (positional indices; single-column categories use sapply for coercion).
spend$Economy <- rowSums(sapply(spend[, c(55, 60, 61, 62)], as.numeric), na.rm = T)
spend$Agriculture <- (sapply(spend[, 56], as.numeric))
spend$Resources <- rowSums(sapply(spend[, c(57, 58)], as.numeric), na.rm = T)
spend$Pensions <- rowSums(sapply(spend[, 4:7], as.numeric), na.rm = T)
spend$Health <- rowSums(sapply(spend[, 8:11], as.numeric), na.rm = T)
spend$Education <- rowSums(sapply(spend[, c(12:15, 54)], as.numeric), na.rm = T)
spend$Military <- rowSums(sapply(spend[, c(16:18, 20, 21)], as.numeric), na.rm = T)
spend$EconomicAid <- (sapply(spend[, 19], as.numeric))
spend$Family <- (sapply(spend[, 22], as.numeric))
spend$Unemployment <- (sapply(spend[, 23], as.numeric))
spend$Transport <- (sapply(spend[, 34], as.numeric))
spend$Religion <- (sapply(spend[, 53], as.numeric))
spend$Housing <- rowSums(sapply(spend[, c(24, 44, 45, 46, 47, 48, 49)], as.numeric), na.rm = T)
spend$Recreation <- rowSums(sapply(spend[, 50:52], as.numeric), na.rm = T)
spend$Environment <- rowSums(sapply(spend[, c(39, 40, 41, 42, 43)], as.numeric), na.rm = T)
spend$Government <- rowSums(sapply(spend[, c(35:38, 59)], as.numeric), na.rm = T)
spend$Welfare <- rowSums(sapply(spend[, c(25, 26, 27)], as.numeric), na.rm = T)
spend$Security <- rowSums(sapply(spend[, c(28, 29, 30, 31, 32, 33)], as.numeric), na.rm = T)

# Spending shares: each category as a fraction of total spending
# (*_Spending) and of nominal GDP (*_nGDP). The two loops replace 36
# near-identical assignments.
# IMPORTANT: later code indexes `spend` by column POSITION (e.g.
# spend[, c(1, 104:119)]), so the creation ORDER of these columns must match
# the original statements exactly: all *_Spending first, then all *_nGDP,
# in the order of `spend_cats` below.
spend_cats <- c(
  "Economy", "Agriculture", "Resources", "Pensions", "Health", "Education",
  "Military", "Family", "Unemployment", "Transport", "Religion", "Housing",
  "Recreation", "Environment", "Government", "Welfare", "Security", "EconomicAid"
)

# Share of total (detailed-sum) spending.
for (nm in spend_cats) {
  spend[[paste0(nm, "_Spending")]] <- spend[[nm]] / spend$Spending2
}

# Share of nominal GDP.
for (nm in spend_cats) {
  spend[[paste0(nm, "_nGDP")]] <- spend[[nm]] / spend$nGDP
}

# Long-format spending tables (reshape2::melt; columns chosen by POSITION —
# fragile, tied to the construction order of `spend` above).
lspend <- melt(spend[, c(1, 104:119)], id.vars = c("Year"))
lspend.GDP <- melt(spend[, c(1, 121:137)], id.vars = c("Year"))
lspend.real <- melt(spend[, c(1, 71, 84:101)], id.vars = c("Year", "GDP.Deflator"))
# Deflate nominal values, and attach the per-year total (repeated once per
# category; 18 detailed spending columns were melted above).
lspend.real$real <- lspend.real$value / as.numeric(lspend.real$GDP.Deflator)
lspend.real$total <- rep(aggregate(lspend.real$real, by = list(lspend.real$Year), FUN = "sum")$x, times = 18)

# BUG FIX(review): the original paired a zero-length `year = c()` with a
# full-length deflator vector, which makes data.frame() error ("arguments
# imply differing number of rows"). Pair the deflator with the Year column.
gdpdeflator <- data.frame(year = spend$Year, deflator = spend$rGDP / spend$nGDP)

# Stacked-area plot of real (deflated) spending by category over time.
ggplot(lspend.real, aes(y = real, x = Year, label = variable, fill = variable, group = variable, color = variable)) +
  geom_area() +
  scale_fill_viridis_d() +
  scale_color_viridis_d() +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  ylab("UK Government Spending [in 2016 pounds]")

# Stacked-area plot of spending shares (of total spending).
ggplot(lspend, aes(y = value, x = Year, fill = variable, group = variable, color = variable)) +
  geom_area(color = "white") +
  scale_x_continuous(name = "Year", breaks = seq(1780, 2020, 10))


# Stacked-area plot of spending shares of nominal GDP.
ggplot(lspend.GDP, aes(y = value, x = Year, fill = variable, group = variable, color = variable)) +
  geom_area() +
  scale_x_continuous(name = "Year", breaks = seq(1780, 2020, 10))

# Quick diagnostics: population and log nominal GDP over time.
ggplot(spend, aes(x = Year, y = Pop)) +
  geom_point() +
  geom_smooth()
ggplot(spend, aes(x = Year, y = log(nGDP + 1))) +
  geom_point() +
  geom_smooth()

# Saved version of the real-spending area plot (blue theme).
gg.spending.real <- ggplot(lspend.real, aes(y = real, x = Year, label = variable, fill = variable, group = variable, color = variable)) +
  geom_area() +
  scale_fill_viridis_d() +
  scale_color_viridis_d() +
  theme_bluewhite() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  ylab("Amount of spending (adjusted)")

lspend.real$SpendingCategory <- lspend.real$variable
# Category share of that year's total spending, in percent.
lspend.real$prop <- 100 * lspend.real$real / lspend.real$total
# Per-year Gini of the spending shares (DescTools::Gini), broadcast onto
# every row of the year. ave() replaces the original subset-and-writeback
# loop over min(Year):max(Year) and is also robust to gaps in the year
# sequence.
lspend.real$gini <- ave(lspend.real$prop, lspend.real$Year, FUN = Gini)

# Spending diversity over time: 1 - Gini of the yearly category shares.
# The gini column is identical on every row of a year, so any single
# category's rows (here "Economy") suffice for the plot.
gg.rspending.gini <- ggplot(subset(lspend.real, variable == "Economy"), aes(y = 1 - gini, x = Year)) +
  geom_area() +
  scale_fill_viridis_d() +
  scale_color_viridis_d() +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  ylab("Diversity of spending categories (1-Gini)")



# Stacked-area plot of category shares of total spending.
gg.rspending.real <- ggplot(lspend.real, aes(y = 100 * real / total, x = Year, label = SpendingCategory, fill = SpendingCategory, group = SpendingCategory, color = SpendingCategory)) +
  geom_area() +
  scale_fill_viridis_d() +
  scale_color_viridis_d() +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  ylab("Share of total government spending")

ggsave(gg.rspending.real, file = "Government_spending_shares.svg", unit = "cm", width = 16, height = 10, dpi = 1200, scale = 1.35)

# Human-readable category labels (drop the "_nGDP" suffix).
lspend.GDP$SpendingCategory <- str_remove(string = lspend.GDP$variable, pattern = "_nGDP")

# Spending as % of GDP, by category.
gg.rspending.GDP <- ggplot(lspend.GDP, aes(y = 100 * value, x = Year, label = SpendingCategory, fill = SpendingCategory, group = SpendingCategory, color = SpendingCategory)) +
  geom_area() +
  scale_fill_viridis_d() +
  scale_color_viridis_d() +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  ylab("Gov. spending as % of GDP")

# Same data but drawn as a single black silhouette (category fills are
# overridden by the fixed fill/color in geom_area).
gg.rspending.GDPtotal <- ggplot(lspend.GDP, aes(y = 100 * value, x = Year, fill = SpendingCategory, group = SpendingCategory, color = SpendingCategory)) +
  geom_area(fill = "black", color = "black") +
  scale_fill_viridis_d() +
  scale_color_viridis_d() +
  theme_soft() +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  ylab("Gov. spending as % of GDP")

# UK suffrage over time from the V-Dem extract (v2x_suffr).
# NOTE(review): the axis label says "Percentage" but v2x_suffr is plotted
# as loaded — confirm whether it is a 0-1 share that should be scaled.
votersUK <- vdemuk[, c("year", "v2x_suffr")]

gg_voter_suffrage <- ggplot(votersUK, aes(y = v2x_suffr, x = year)) +
  geom_area() +
  theme_soft() +
  xlab("Year") +
  ylab("Percentage of adults with suffrage in the UK") +
  scale_x_continuous(limits = c(1790, 2020), breaks = seq(1790, 2020, 10))



# Figure E1

# Government spending intensity (% of GDP), PNG + SVG.
ggsave(gg.rspending.GDPtotal, file = "gov_spend_int.png", unit = "cm", width = 16, height = 6, dpi = 1200, scale = 1.50)
ggsave(gg.rspending.GDPtotal, file = "gov_spend_int.svg", unit = "cm", width = 16, height = 6, dpi = 1200, scale = 1.50)

# ggsave(gg.rspending.GDP,file="gov_spend_int_cat.png",unit="cm",width=16,height=6,dpi=1200,scale=2.25)

# Figure E2
ggsave(gg.rspending.gini, file = "gov_spend_div.png", unit = "cm", width = 16, height = 6, dpi = 1200, scale = 1.50)
ggsave(gg.rspending.gini, file = "gov_spend_div.svg", unit = "cm", width = 16, height = 6, dpi = 1200, scale = 1.50)

# Figure E3
ggsave(gg_voter_suffrage, file = "voter_suffrage.svg", unit = "cm", width = 16, height = 6, dpi = 1200, scale = 1.50)

# Figure E4
gg_media_penetration <- ggplot(thetimes, aes(x = year, y = Penetration)) +
  geom_area() +
  theme_soft()
ggsave(gg_media_penetration, file = "media_penetration.png", unit = "cm", width = 16, height = 6, dpi = 1200, scale = 2.00)
ggsave(gg_media_penetration, file = "media_penetration.svg", unit = "cm", width = 16, height = 6, dpi = 1200, scale = 2.00)

# Figure E5
gg_media_autonomy <- ggplot(thetimes, aes(x = year, y = media_autonomy_i)) +
  geom_area() +
  theme_soft()
ggsave(gg_media_autonomy, file = "media_autonomy.png", unit = "cm", width = 16, height = 6, dpi = 1200, scale = 2.00)
ggsave(gg_media_autonomy, file = "media_autonomy.svg", unit = "cm", width = 16, height = 6, dpi = 1200, scale = 2.00)

# Figure E6
# Stack the per-country V-Dem media-autonomy series into one long frame.
co_vdem <- data.frame(
  year = c(vdemru$year, vdemno$year, vdemus$year, vdemuk$year, vdemde$year, vdemch$year, vdemnl$year, vdemcn$year),
  media_autonomy = c(vdemru$media_autonomy, vdemno$media_autonomy, vdemus$media_autonomy, vdemuk$media_autonomy, vdemde$media_autonomy, vdemch$media_autonomy, vdemnl$media_autonomy, vdemcn$media_autonomy),
  country = rep(c("RU", "NO", "US", "UK", "DE", "CH", "NL", "CN"), times = c(dim(vdemru)[1], dim(vdemno)[1], dim(vdemus)[1], dim(vdemuk)[1], dim(vdemde)[1], dim(vdemch)[1], dim(vdemnl)[1], dim(vdemcn)[1]))
)


# Rescale media autonomy to 0-100 using the pooled min/max across countries.
co_vdem$minimum <- min(co_vdem$media_autonomy, na.rm = TRUE)
co_vdem$maximum <- max(co_vdem$media_autonomy, na.rm = TRUE)
co_vdem$media_autonomy_bi <- 100 * (co_vdem$media_autonomy - co_vdem$minimum) / (co_vdem$maximum - co_vdem$minimum)

# NOTE(review): scale_linetype_manual supplies 8 values but no linetype
# aesthetic is mapped (linewidth is mapped instead), so that scale is inert;
# the shape/size scales supply 4 values for the 4 plotted countries.
gg_vdem <- ggplot(subset(co_vdem, country %in% c("UK", "DE", "RU", "US")), aes(x = year, y = media_autonomy_bi, group = country, color = country, size = country, fill = country, shape = country, linewidth = country)) +
  geom_point() +
  geom_smooth(linewidth = 0.5, se = FALSE, span = .15) +
  theme_soft() +
  ylim(0, 100) +
  scale_linetype_manual(values = c("dotted", "dotted", "dotted", "dotted", "dotted", "dotted", "solid", "dotted")) +
  scale_shape_manual(values = c(0, 0, 15, 0)) +
  scale_size_manual(values = c(1, 1, 2, 1)) +
  scale_color_viridis_d(option = "inferno", begin = .2, end = .8) +
  scale_fill_viridis_d(option = "inferno", begin = .2, end = .8) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  scale_x_continuous(breaks = seq(1780, 2020, 10))

ggsave(gg_vdem, file = "vdem_ma.png", unit = "cm", scale = 2.00, dpi = 1200, width = 16, height = 6)
ggsave(gg_vdem, file = "vdem_ma.svg", unit = "cm", scale = 2.00, dpi = 1200, width = 16, height = 6)


### Appendix G: Comparative analysis in Times, Economist, NZZ, and Washington Star

# Pre-computed comparison frames loaded from the .RData images above.
df_total # Data for G1
df_crisisarticles # Data for G2
df_crisisshare # Data for G3
df_wavearticles # Data for G4
df_waveshare # Data for G5
df_waves # Data for G6

## PNG
ggsave(gg_year_waves, file = "wave_count_4np.png", unit = "cm", width = 16, height = 16, dpi = 1200, scale = 1.25)
ggsave(gg_year_waveshare, file = "wave_share_4np.png", unit = "cm", width = 16, height = 16, dpi = 1200, scale = 1.25)
ggsave(gg_wave_duration, file = "wave_duration_4np.png", unit = "cm", width = 16, height = 16, dpi = 1200, scale = 1.25)
ggsave(gg_year_clshare, file = "cl_share_4np.png", unit = "cm", width = 16, height = 16, dpi = 1200, scale = 1.25)
ggsave(gg_year_clarticles, file = "cl_articles_4np.png", unit = "cm", width = 16, height = 16, dpi = 1200, scale = 1.25)
ggsave(gg_year_articles, file = "articles_4np.png", unit = "cm", width = 16, height = 16, dpi = 1200, scale = 1.25)
ggsave(gg_year_wavearticles, file = "wave_articles_4np.png", unit = "cm", width = 16, height = 16, dpi = 1200, scale = 1.25)

## SVG
ggsave(gg_year_articles, file = "articles_4np.svg", device = "svg", unit = "cm", width = 16, height = 16, scale = 1.25, dpi = 1200) # Figure G1
ggsave(gg_year_clarticles, file = "cl_articles_4np.svg", device = "svg", unit = "cm", width = 16, height = 16, scale = 1.25, dpi = 1200) # Figure G2
ggsave(gg_year_clshare, file = "cl_share_4np.svg", device = "svg", unit = "cm", width = 16, height = 16, scale = 1.25, dpi = 1200) # Figure G3
ggsave(gg_year_wavearticles, file = "wave_articles_4np.svg", device = "svg", unit = "cm", width = 16, height = 16, scale = 1.25, dpi = 1200) # Figure G4
ggsave(gg_year_waveshare, file = "wave_share_4np.svg", device = "svg", unit = "cm", width = 16, height = 16, scale = 1.25, dpi = 1200) # Figure G5
ggsave(gg_year_waves, file = "wave_count_4np.svg", device = "svg", unit = "cm", width = 16, height = 16, scale = 1.25, dpi = 1200) # Figure G6

### Appendix H: Sensitivity analysis

# Figure H1

### Appendix I: The Times in context

# Context data for Appendix I: article volume, editorships, ownership,
# historical events, cover prices, and circulation of The Times.

# Yearly article counts, taken from the `thetimes` frame built earlier.
newsvolume <- data.frame(year = thetimes$year, articles = thetimes$articles)

# Editor tenures 1785-2020; consecutive rows share boundary years. The two
# 1919-1919 rows encode two very short editorships in the same year.
editors <- data.frame(
  start = c(1785, 1803, 1812, 1817, 1841, 1877, 1884, 1912, 1919, 1919, 1923, 1941, 1948, 1952, 1967, 1981, 1982, 1985, 1990, 1992, 2002, 2007, 2013),
  end = c(1803, 1812, 1817, 1841, 1877, 1884, 1912, 1919, 1919, 1923, 1941, 1948, 1952, 1967, 1981, 1982, 1985, 1990, 1992, 2002, 2007, 2013, 2020),
  editor = c("Walter I", "Walter II", "Stoddard", "Barnes", "Delane", "Chenery", "Buckle", "Dawson", "Freeman", "Steed", "Dawson", "Barrington-Ward", "Casey", "Haley", "Rees-Mogg", "Evans", "Douglas-Home", "Wilson", "Jenkins", "Stothard", "Thomson", "Harding", "Witherow")
)

# Ownership periods 1785-2020.
owners <- data.frame(
  start = c(1785, 1803, 1847, 1894, 1908, 1922, 1959, 1966, 1976, 1981),
  end = c(1803, 1847, 1894, 1908, 1922, 1959, 1966, 1976, 1981, 2020),
  owner = c("Walter I", "Walter II", "Walter III", "Walter IV", "Harmsworth", "Astor I", "Astor II", "Thomson I", "Thomson II", "Murdoch")
)

# Major historical / technological events annotated on Figure I2
# (not chronologically ordered; each row is start-end-label).
events <- data.frame(
  start = c(1803, 1914, 1939, 1935, 1887, 1920, 1814, 1844, 1838, 1848, 1860, 1978),
  end = c(1815, 1918, 1945, 1938, 1888, 1921, 1815, 1845, 1839, 1870, 1866, 1979),
  event = c("Napoleonic Wars", "World War I", "World War II", "Appeasement", "Piggott forgeries", "Zion hoax", "Hi-speed steam press", "Rotary press", "London-Birmingham Postal Railway", "Telegraph network established", "'Walter Press'", "Strike")
)

# Cover prices, converted to pounds: pre-decimal prices in pence/240
# (240 old pence per pound), post-1971 prices in new pence/100.
prices <- data.frame(
  year = c(1788, 1797, 1805, 1814, 1824, 1838, 1847, 1857, 1865, 1873, 1882, 1890, 1907, 1921, 1931, 1939, 1950, 1973, 1980, 1990, 2000, 2007, 2014, 2020),
  price = c(3 / 240, 6 / 240, 6 / 240, 6.5 / 240, 7 / 240, 5 / 240, 5 / 240, 4 / 240, 3 / 240, 3 / 240, 3 / 240, 3 / 240, 3 / 240, 5 / 240, 4 / 240, 2.5 / 240, 3 / 240, 3 / 100, 20 / 100, 35 / 100, 35 / 100, 65 / 100, 120 / 100, 220 / 100)
)

# Interpolated GDP deflator: pads the series with constant values for the
# first 11 and last 2 years where spend$GDP.Deflator has no data.
# NOTE(review): assumes spend has 232 rows (11 + 219 + 2) — confirm.
spend$GDP_deflator_i <- as.numeric(c(rep(1.41, times = 11), spend$GDP.Deflator[12:230], rep(106.00, times = 2)))

# Inflation-adjusted price: nominal price deflated by the matched-year
# deflator (match() returns NA -> NA realprice for unmatched years).
prices$realprice <- 100 * prices$price / spend[match(prices$year, spend$Year), "GDP_deflator_i"]

# Circulation figures at irregular intervals.
circulation <- data.frame(
  year = c(1815, 1852, 1910, 1921, 1930, 1939, 1947, 1956, 1966, 1976, 1980, 1992, 2000, 2005, 2010, 2015, 2019),
  circulation = c(5000, 42384, 45000, 113000, 187000, 204000, 268000, 220716, 282000, 310000, 297000, 386258, 726349, 686327, 508250, 396621, 417298)
)


# Figure I1
coverage.volume.trajectory <- ggplot() +
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "white", size = 1) +
  geom_point(data = newsvolume, aes(y = articles, x = year), color = "black") +
  geom_smooth(data = newsvolume, aes(y = articles, x = year), color = "dodgerblue") +
  geom_point(data = circulation, aes(y = circulation / 7, x = year), shape = 15, size = 4, color = "#888888") +
  geom_point(data = prices, aes(y = realprice * 50000, x = year), shape = 16, size = 4, color = "hotpink") +
  geom_text(data = prices, aes(y = realprice * 50000, x = year, label = round(realprice, 2)), size = 4, color = "hotpink", nudge_y = 4000) +
  geom_text(data = circulation, aes(y = circulation / 7, x = year + 2, label = circulation), color = "#888888", hjust = 0) +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  theme_soft() +
  theme(legend.position = "none") +
  annotate("text", size = 5, x = c(1850, 1930, 1825), y = c(25000, 0, 120000), color = c("black", "darkgrey", "hotpink"), label = c("total stories published", "total circulation", "price (inflation adjusted)"), fontface = "bold")
coverage.volume.trajectory

ggsave(coverage.volume.trajectory, file = "circulation.svg", unit = "cm", width = 16, height = 10, scale = 1.75, dpi = 1200)

# ggsave(coverage.volume.trajectory,file="cov_vol_traj.svg",unit="cm",width=16,height=12,scale=1.5)

# Figure I2: timeline of editors, owners, and events as stacked bands of
# rectangles with hand-positioned staggered labels.
backgrounds <- ggplot() +
  # Fix: `size` is the deprecated line-width aesthetic for line geoms
  # (ggplot2 >= 3.4 uses `linewidth`, consistent with the rest of the file).
  geom_vline(xintercept = c(1800, 1850, 1900, 1950, 2000), color = "white", linewidth = 1) +
  geom_rect(data = editors, aes(ymin = 1000, ymax = 3000, xmin = start, xmax = end, fill = editor), color = "white") +
  # Label y positions cycle through four levels to avoid overlap; the
  # vector length (23) must match nrow(editors).
  geom_text(data = editors, aes(y = c(rep(c(-1000, -7000, -4000, -10000), times = 5), -7000, -10000, -4000), x = start, label = editor), hjust = 0) +
  geom_rect(data = owners, aes(ymin = 40000, ymax = 42000, xmin = start, xmax = end, fill = owner), color = "white") +
  # 10 label positions for the 10 ownership periods.
  geom_text(data = owners, aes(y = c(rep(c(38000, 32000, 35000, 29000), times = 2), 38000, 32000), x = start, label = owner), hjust = 0) +
  geom_rect(data = events, aes(ymin = 20000, ymax = 25000, xmin = start, xmax = end, fill = event), color = "white") +
  # 12 label positions for the 12 events.
  geom_text(data = events, aes(y = c(18000, 18000, 18000, 15000, 15000, 15000, 12000, 18000, 9000, 6000, 12000, 12000), x = start, label = event), hjust = 0) +
  scale_x_continuous(breaks = seq(1780, 2020, 10)) +
  theme_soft() +
  # Hide the meaningless y axis by printing its labels in white.
  theme(legend.position = "none", axis.text.y = element_text(color = "white")) +
  ylab("Events")
backgrounds

ggsave(backgrounds, file = "backgrounds.svg", unit = "cm", width = 16, height = 8, scale = 2.25)

### Appendix J: Check for potential circularity of crisis frame sponsor measurement

# One coded "category" value per year 1785-2020 (236 values). Codes appear
# to be numeric organization-type identifiers (e.g. 230, 1530, 9000);
# their codebook is not visible here — see the coding scheme in the
# project documentation. Inline comments mark half-century boundaries.
organization_composition_data <- data.frame(
  year = c(1785:2020),
  category = c(
    rep(0, times = 25), 230, 0, 111, 230, 230, 230, 0, 230, 150, 230, 230, 0, 230, 230, 130, 230, 132, 150, 9000, 1410, 1230, 230, 230, 230, 230, 230, 230, 230, 230, 230, 230, 230, 132, 230, 230, 230, 150, 140, 1530, 700, 111, # 1850
    131, 430, 132, 1530, 1427, 230, 132, 132, 1410, 132, 230, 1230, 131, 1410, 1427, 1530, 1530, 131, 1530, 150, 230, 1530, 133, 131, 330, 1427, 230, 230, 1700, 131, 1410, 1530, 230, 115, 800, 230, 230, 130, 1530, 1422, 132, 230, 131, 1411, 1530, 230, 430, 1530, 230, # 1900
    1700, 230, 1530, 9000, 1417, 1530, 1427, 1700, 132, 1230, 1230, 1530, 133, 1427, 133, 132, 1700, 132, 1230, 132, 133, 1700, 1610, 1530, 1230, 800, 132, 111, 1300, 1530, 1530, 230, 1411, 700, 1530, 132, 1415, 133, 230, 430, 230, 230, 1417, 130, 1415, 1530, 1700, 1412, 110, 130, 1427, # 1950
    1700, 132, 132, 230, 110, 132, 132, 1230, 133, 110, 1530, 1410, 1300, 132, 1413, 1530, 510, 1530, 1415, 230, 1100, 1300, 230, 1412, 510, 132, 1530, 110, 140, 111, 1427, 1427, 810, 230, 510, 800, 810, 1100, 1610, 720, 400, 330, 1415, 150, 1410, 1415, 510, 132, 112, 1427, # 2000
    1400, 320, 111, 130, 820, 1300, 510, 1416, 1427, 1417, 1427, 150, 120, 1415, 600, 510, 820, 1427, 1000, 820
  )
)

# Widen to one column per organization category, one row per year.
# NOTE(review): organization_composition_wdata2 is identical to
# organization_composition_wdata and is never used below — kept unchanged
# in case later code references it; consider removing.
organization_composition_wdata2 <- pivot_wider(organization_composition_data, id_cols = year, names_from = category, values_from = category)

organization_composition_wdata <- pivot_wider(organization_composition_data, id_cols = year, names_from = category, values_from = category)

# Category columns are everything after the leading `year` column. Derived
# from the data rather than the previous hard-coded 2:43 so the code keeps
# working if the category set changes.
val_cols <- 2:ncol(organization_composition_wdata)

# Recode to presence/absence: NA (category absent that year) -> 0,
# any positive code -> 1.
organization_composition_wdata[, val_cols] <- replace(organization_composition_wdata[, val_cols], is.na(organization_composition_wdata[, val_cols]), 0)

organization_composition_wdata[, val_cols] <- replace(organization_composition_wdata[, val_cols], (organization_composition_wdata[, val_cols]) > 0, 1)

# Turn presence/absence into a decaying activity score: +1 for each year a
# category appears, -0.25 per year of absence (floored at 0 once the score
# drops below 0.09). Row order is the 1785-2020 year sequence.
for (i in 2:nrow(organization_composition_wdata)) {
  for (j in val_cols) {
    organization_composition_wdata[i, j] <- ifelse(organization_composition_wdata[i, j] == 1, organization_composition_wdata[i - 1, j] + 1,
      ifelse(organization_composition_wdata[i - 1, j] > 0.09, organization_composition_wdata[i - 1, j] - 0.25, 0)
    )
  }
}

organization_composition_wdata$Gini <- NA

# Year-wise Gini over the activity scores. val_cols was computed before the
# Gini column was appended, so it still excludes it.
for (i in seq_len(nrow(organization_composition_wdata)))
{
  organization_composition_wdata[i, "Gini"] <- Gini(organization_composition_wdata[i, val_cols])
}

# Appendix J, Figure J1: diversity of crisis frame sponsors per year,
# plotted as 1 - Gini (higher = more even spread across organization types).
gg_org_comp <- ggplot(organization_composition_wdata, aes(x = year, y = 1 - Gini)) +
  geom_point() +
  geom_smooth(span = .25) +
  theme_soft() +
  xlab("Year") +
  ylab("Diversity of actors (1-Gini)") +
  scale_x_continuous(breaks = seq(1780, 2020, 20))

# Figure J1
# PNG
ggsave(gg_org_comp, file = "org_comp.png", unit = "cm", width = 16, height = 8, dpi = 1200, scale = 1.25)

# SVG
ggsave(gg_org_comp, file = "org_comp.svg", unit = "cm", width = 16, height = 8, dpi = 1200, scale = 1.25)

### Appendix K: Details on search string generation and validation

# No data to report

### Appendix L: Details on text processing

# No data to report

### Appendix M: Details on structural topic modeling

# Appendix M: topic-model selection diagnostics. Map the searchK metric
# names in sk10t (long format) to human-readable facet labels
# (Recode presumably comes from the car package loaded earlier).
sk10t$var <- Recode(sk10t$variable, "'exclus'='Exclusivity';'semcoh'='Semantic coherence';'heldout'='Heldout likelihood';'residual'='Residual';'bound'='Bound';'lbound'='Lower Bound';'em.its'='Iterations'")

# Four diagnostics faceted over K; the red line marks K = 250,
# presumably the number of topics chosen for the final model — confirm.
search_for_K <- ggplot(subset(sk10t, variable %in% c("exclus", "semcoh", "heldout", "residual")), aes(x = K, y = value)) +
  geom_point() +
  geom_line() +
  geom_smooth(color = "darkslategray4") +
  geom_vline(xintercept = 250, color = "red") +
  facet_wrap(. ~ var, scales = "free_y") +
  theme_soft() +
  ylab("Value")

# Figure M1
ggsave(search_for_K, file = "search_for_K.svg", dpi = 1200, scale = 1.5, unit = "cm", width = 16, height = 10)

### Appendix N: Details on automatic crisis news wave (CNW) detection

### Appendix O: Details on the Named Entity Recognition procedure

### Appendix P: Calculation of government spending diversity













### LMERs

# Express crisis-news-wave share as a percentage for interpretability.
df_waveshare$cnwv_share100 <- 100 * df_waveshare$cnwv_share

# Linear mixed model: CNW share (%) regressed on year, with a random
# intercept per outlet (lme4/lmerTest loaded earlier).
lmer_cnw_share <- lmer(cnwv_share100 ~ 1 + year + (1 | outlet), data = df_waveshare)

### Government spending diversity calculation.

# Columns of `spend` holding spending shares (names ending in "_Spending").
spending_share_vars <- names(spend)[str_detect(names(spend), ".*_Spending")]

# Spending shares at six benchmark years, with and without the Year column.
table_spending_categories_year <- spend[spend$Year %in% c(1800, 1850, 1900, 1950, 2000, 2020), c("Year", spending_share_vars)]

table_spending_categories_year2 <- spend[spend$Year %in% c(1800, 1850, 1900, 1950, 2000, 2020), c(spending_share_vars)]

# Manual weighted Gini over 17 spending categories, equal weights.
# NOTE(review): wm is set to 1/17, i.e. the mean share under the assumption
# that shares sum to 1 within each year — confirm against the spend data.
w <- rep(1 / 17, times = 17)
wm <- 1 / 17

# Each row (benchmark year) sorted ascending, as the Gini formula requires.
x1 <- table_spending_categories_year2[1, ][order(table_spending_categories_year2[1, ], decreasing = FALSE)]
x2 <- table_spending_categories_year2[2, ][order(table_spending_categories_year2[2, ], decreasing = FALSE)]
x3 <- table_spending_categories_year2[3, ][order(table_spending_categories_year2[3, ], decreasing = FALSE)]
x4 <- table_spending_categories_year2[4, ][order(table_spending_categories_year2[4, ], decreasing = FALSE)]
x5 <- table_spending_categories_year2[5, ][order(table_spending_categories_year2[5, ], decreasing = FALSE)]
x6 <- table_spending_categories_year2[6, ][order(table_spending_categories_year2[6, ], decreasing = FALSE)]

# Midpoints of the cumulative weight distribution (plotting positions).
f.hat <- w / 2 + c(0, head(cumsum(w), -1))

# Weighted Gini (covariance form) for each benchmark year.
res1 <- 2 / wm * sum(w * (x1 - wm) * (f.hat - mean(f.hat)))
res2 <- 2 / wm * sum(w * (x2 - wm) * (f.hat - mean(f.hat)))
res3 <- 2 / wm * sum(w * (x3 - wm) * (f.hat - mean(f.hat)))
res4 <- 2 / wm * sum(w * (x4 - wm) * (f.hat - mean(f.hat)))
res5 <- 2 / wm * sum(w * (x5 - wm) * (f.hat - mean(f.hat)))
res6 <- 2 / wm * sum(w * (x6 - wm) * (f.hat - mean(f.hat)))

# Small-sample (unbiased) correction, applied to year 1 only here.
res1x <- res1 * 1 / (1 - sum(w^2))

#' Weighted Gini coefficient of a numeric vector.
#'
#' @param x Numeric vector of values (e.g. spending shares).
#' @param w Numeric weights, same length as x; normalized internally.
#' @param unbiased Apply the small-sample correction 1 / (1 - sum(w^2))?
#' @return The (optionally bias-corrected) weighted Gini coefficient.
i.gini <- function(x, w, unbiased = FALSE) {
  w <- w / sum(w)
  # BUG FIX: the original sorted the global `x1` instead of the argument
  # `x`, so the function ignored its own input.
  id <- order(x)
  x <- x[id]
  w <- w[id]
  # Midpoints of the cumulative weight distribution (plotting positions).
  f.hat <- w / 2 + c(0, head(cumsum(w), -1))
  # stats::weighted.mean replaces DescTools::Mean(x, w) — same result,
  # no extra dependency.
  wm <- weighted.mean(x, w)
  # Covariance form of the weighted Gini.
  res <- 2 / wm * sum(w * (x - wm) * (f.hat - weighted.mean(f.hat, w)))
  if (unbiased) {
    res <- res * 1 / (1 - sum(w^2))
  }
  res
}


# NOTE(review): `my` (and possibly `x`) is not defined anywhere visible in
# this script — this looks like an abandoned scratch line for the Gini
# derivation above; evaluating it will error. Consider removing.
2 * my * (sum(x - my))

# Cross-checks of the Gini values using alternative closed-form formulas
# (rank-based and mean-absolute-deviation forms). These print to the
# console and assign nothing.
sum(2 * (1:17) - 17 - 1 * (100 * table_spending_categories_year2[1, ][order(table_spending_categories_year2[1, ], decreasing = TRUE)])) / (17 * sum((100 * table_spending_categories_year2[1, ][order(table_spending_categories_year2[1, ], decreasing = TRUE)])))

# Rank-weighted deviation form, per benchmark year (shares in percent;
# 100/17 is the equal-share benchmark).
2 * sum(1:17 * (100 * table_spending_categories_year2[1, ][order(table_spending_categories_year2[1, ], decreasing = TRUE)] - 100 / 17)) / (17 * 17 * 100 / 17)
2 * sum(1:17 * (100 * table_spending_categories_year2[2, ][order(table_spending_categories_year2[2, ], decreasing = TRUE)] - 100 / 17)) / (17 * 17 * 100 / 17)
2 * sum(1:17 * (100 * table_spending_categories_year2[3, ][order(table_spending_categories_year2[3, ], decreasing = TRUE)] - 100 / 17)) / (17 * 17 * 100 / 17)
2 * sum(1:17 * (100 * table_spending_categories_year2[4, ][order(table_spending_categories_year2[4, ], decreasing = TRUE)] - 100 / 17)) / (17 * 17 * 100 / 17)
2 * sum(1:17 * (100 * table_spending_categories_year2[5, ][order(table_spending_categories_year2[5, ], decreasing = TRUE)] - 100 / 17)) / (17 * 17 * 100 / 17)
2 * sum(1:17 * (100 * table_spending_categories_year2[6, ][order(table_spending_categories_year2[6, ], decreasing = TRUE)] - 100 / 17)) / (17 * 17 * 100 / 17)

# Sanity check with a perfectly equal distribution (5.88 ~ 100/17).
2 * sum(1:17 * 100 * rep(100 / 17, times = 17) - 100 / 17) / (17 * 17 * 5.88)

# Mean-absolute-deviation form.
# NOTE(review): row index 6 is repeated three times below — presumably
# rows 4 and 5 were intended for the last two lines; verify.
sum(abs(100 * table_spending_categories_year2[1, ] - 100 / 17)) / (2 * 17 * 17 * 100 / 17)
sum(abs(100 * table_spending_categories_year2[2, ] - 100 / 17)) / (2 * 17 * 17 * 100 / 17)
sum(abs(100 * table_spending_categories_year2[6, ] - 100 / 17)) / (2 * 17 * 17 * 100 / 17)
sum(abs(100 * table_spending_categories_year2[6, ] - 100 / 17)) / (2 * 17 * 17 * 100 / 17)
sum(abs(100 * table_spending_categories_year2[6, ] - 100 / 17)) / (2 * 17 * 17 * 100 / 17)

# Worked two-point example of the rank-based formula (97/3 split).
1 / (100 * 100 * 5.88) * (97 * (2 * 1 - 17 - 1)) + (3 * (2 * 2 - 17 - 1))